diff options
author | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-21 21:16:54 +0200 |
---|---|---|
committer | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-22 02:30:50 +0200 |
commit | f1971eb673c55afe9836484e91715200410af5bb (patch) | |
tree | 4bba48436b89a140461e579be77a43f6dde9a70a | |
parent | Crawler: Add basic indexer communication (diff) | |
download | OSSE-f1971eb673c55afe9836484e91715200410af5bb.tar.gz OSSE-f1971eb673c55afe9836484e91715200410af5bb.tar.bz2 OSSE-f1971eb673c55afe9836484e91715200410af5bb.zip |
Crawler: Implement basic async functionality
-rw-r--r-- | Cargo.lock | 293 | ||||
-rw-r--r-- | crawler/Cargo.toml | 1 | ||||
-rw-r--r-- | crawler/src/main.rs | 84 |
3 files changed, 285 insertions, 93 deletions
diff --git a/Cargo.lock b/Cargo.lock index 2121ec9..5520569 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9,13 +9,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57a7559404a7f3573127aab53c08ce37a6c6a315c374a31070f3c91cd1b4a7fe" dependencies = [ "bitflags", - "bytes", + "bytes 1.2.1", "futures-core", "futures-sink", "log", "memchr", - "pin-project-lite", - "tokio", + "pin-project-lite 0.2.9", + "tokio 1.21.2", "tokio-util", ] @@ -33,7 +33,7 @@ dependencies = [ "base64", "bitflags", "brotli", - "bytes", + "bytes 1.2.1", "bytestring", "derive_more", "encoding_rs", @@ -48,7 +48,7 @@ dependencies = [ "local-channel", "mime", "percent-encoding", - "pin-project-lite", + "pin-project-lite 0.2.9", "rand 0.8.5", "sha1", "smallvec", @@ -86,7 +86,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ea16c295198e958ef31930a6ef37d0fb64e9ca3b6116e6b93a8bdae96ee1000" dependencies = [ "futures-core", - "tokio", + "tokio 1.21.2", ] [[package]] @@ -100,10 +100,10 @@ dependencies = [ "actix-utils", "futures-core", "futures-util", - "mio", + "mio 0.8.4", "num_cpus", "socket2", - "tokio", + "tokio 1.21.2", "tracing", ] @@ -115,7 +115,7 @@ checksum = "3b894941f818cfdc7ccc4b9e60fa7e53b5042a2e8567270f9147d5591893373a" dependencies = [ "futures-core", "paste", - "pin-project-lite", + "pin-project-lite 0.2.9", ] [[package]] @@ -125,7 +125,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e491cbaac2e7fc788dfff99ff48ef317e23b3cf63dbaf7aaab6418f40f92aa94" dependencies = [ "local-waker", - "pin-project-lite", + "pin-project-lite 0.2.9", ] [[package]] @@ -144,9 +144,9 @@ dependencies = [ "actix-utils", "actix-web-codegen", "ahash", - "bytes", + "bytes 1.2.1", "bytestring", - "cfg-if", + "cfg-if 1.0.0", "cookie", "derive_more", "encoding_rs", @@ -158,7 +158,7 @@ dependencies = [ "log", "mime", "once_cell", - "pin-project-lite", + "pin-project-lite 0.2.9", "regex", "serde", "serde_json", @@ -290,6 +290,12 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "bytes" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e4cec68f03f32e44924783795810fa50a7035d8c8ebe78580ad7e6c703fba38" + +[[package]] +name = "bytes" version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec8a7b6a70fde80372154c65702f00a0f56f3e1c36abbc6c440484be248856db" @@ -300,7 +306,7 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "86b6a75fd3048808ef06af5cd79712be8111960adaf89d90250974b38fc3928a" dependencies = [ - "bytes", + "bytes 1.2.1", ] [[package]] @@ -314,6 +320,12 @@ dependencies = [ [[package]] name = "cfg-if" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" + +[[package]] +name = "cfg-if" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" @@ -373,6 +385,7 @@ dependencies = [ "reqwest", "scraper", "serde", + "tokio 0.2.25", ] [[package]] @@ -381,7 +394,7 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", ] [[package]] @@ -477,7 +490,7 @@ version = "0.8.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9852635589dc9f9ea1b6fe9f05b50ef208c85c834a562f0c6abb1c475736ec2b" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", ] [[package]] @@ -530,6 +543,22 @@ dependencies = [ ] [[package]] +name = "fuchsia-zircon" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82" +dependencies = [ + "bitflags", + "fuchsia-zircon-sys", +] + +[[package]] +name = "fuchsia-zircon-sys" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" + +[[package]] name = "futf" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -582,7 +611,7 @@ dependencies = [ "futures-io", "futures-task", "memchr", - "pin-project-lite", + "pin-project-lite 0.2.9", "pin-utils", "slab", ] @@ -621,7 +650,7 @@ version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "libc", "wasi 0.9.0+wasi-snapshot-preview1", ] @@ -632,7 +661,7 @@ version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4eb1a864a501629691edf6c15a593b7a51eebaa1e8468e9ddc623de7c9b58ec6" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "libc", "wasi 0.11.0+wasi-snapshot-preview1", ] @@ -643,7 +672,7 @@ version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5ca32592cf21ac7ccab1825cd87f6c9b3d9022c44d086172ed0966bec8af30be" dependencies = [ - "bytes", + "bytes 1.2.1", "fnv", "futures-core", "futures-sink", @@ -651,7 +680,7 @@ dependencies = [ "http", "indexmap", "slab", - "tokio", + "tokio 1.21.2", "tokio-util", "tracing", ] @@ -691,7 +720,7 @@ version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75f43d41e26995c17e71ee126451dd3941010b0514a81a9d11f3b341debc2399" dependencies = [ - "bytes", + "bytes 1.2.1", "fnv", "itoa 1.0.4", ] @@ -702,9 +731,9 @@ version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d5f38f16d184e36f2408a55281cd658ecbd3ca05cce6d6510a176eca393e26d1" dependencies = [ - "bytes", + "bytes 1.2.1", "http", - "pin-project-lite", + "pin-project-lite 0.2.9", ] [[package]] @@ -725,7 +754,7 @@ version = "0.14.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "02c929dc5c39e335a03c405292728118860721b10190d98c2a0f0efd5baafbac" dependencies = [ - "bytes", + "bytes 1.2.1", "futures-channel", "futures-core", "futures-util", @@ -735,9 +764,9 @@ dependencies = [ "httparse", "httpdate", "itoa 1.0.4", - "pin-project-lite", + "pin-project-lite 0.2.9", "socket2", - "tokio", + "tokio 1.21.2", "tower-service", "tracing", "want", @@ -749,10 +778,10 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905" dependencies = [ - "bytes", + "bytes 1.2.1", "hyper", "native-tls", - "tokio", + "tokio 1.21.2", "tokio-native-tls", ] @@ -790,7 +819,16 @@ version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", +] + +[[package]] +name = "iovec" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2b3ea6ff95e175473f8ffe6a7eb7c00d054240321b84c57051175fe3c1e075e" +dependencies = [ + "libc", ] [[package]] @@ -839,6 +877,16 @@ dependencies = [ ] [[package]] +name = "kernel32-sys" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" +dependencies = [ + "winapi 0.2.8", + "winapi-build", +] + +[[package]] name = "language-tags" version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -890,7 +938,7 @@ version = "0.4.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", ] [[package]] @@ -942,6 +990,25 @@ dependencies = [ [[package]] name = "mio" +version = "0.6.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4afd66f5b91bf2a3bc13fad0e21caedac168ca4c707504e75585648ae80e4cc4" +dependencies = [ + "cfg-if 0.1.10", + "fuchsia-zircon", + "fuchsia-zircon-sys", + "iovec", + "kernel32-sys", + "libc", + "log", + "miow 0.2.2", + "net2", + "slab", + "winapi 0.2.8", +] + +[[package]] +name = "mio" version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57ee1c23c7c63b0c9250c339ffdc69255f110b298b901b9f6c82547b7b87caaf" @@ -953,6 +1020,50 @@ dependencies = [ ] [[package]] +name = "mio-named-pipes" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0840c1c50fd55e521b247f949c241c9997709f23bd7f023b9762cd561e935656" +dependencies = [ + "log", + "mio 0.6.23", + "miow 0.3.7", + "winapi 0.3.9", +] + +[[package]] +name = "mio-uds" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "afcb699eb26d4332647cc848492bbc15eafb26f08d0304550d5aa1f612e066f0" +dependencies = [ + "iovec", + "libc", + "mio 0.6.23", +] + +[[package]] +name = "miow" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebd808424166322d4a38da87083bfddd3ac4c131334ed55856112eb06d46944d" +dependencies = [ + "kernel32-sys", + "net2", + "winapi 0.2.8", + "ws2_32-sys", +] + +[[package]] +name = "miow" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9f1c5b025cda876f66ef43a113f91ebc9f4ccef34843000e0adf6ebbab84e21" +dependencies = [ + "winapi 0.3.9", +] + +[[package]] name = "native-tls" version = "0.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -971,6 +1082,17 @@ dependencies = [ ] [[package]] +name = "net2" +version = "0.2.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74d0df99cfcd2530b2e694f6e17e7f37b8e26bb23983ac530c0c97408837c631" +dependencies = [ + "cfg-if 0.1.10", + "libc", + "winapi 0.3.9", +] + +[[package]] name = "new_debug_unreachable" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1014,7 +1136,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "12fc0523e3bd51a692c8850d075d74dc062ccf251c0110668cbd921917118a13" dependencies = [ "bitflags", - "cfg-if", + "cfg-if 1.0.0", "foreign-types", "libc", "once_cell", @@ -1068,7 +1190,7 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4dc9e0dc2adc1c69d09143aff38d3d30c5c3f0df0dad82e6d25547af174ebec0" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "libc", "redox_syscall", "smallvec", @@ -1162,6 +1284,12 @@ dependencies = [ [[package]] name = "pin-project-lite" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "257b64915a082f7811703966789728173279bdebb956b143dbcd23f6f970a777" + +[[package]] +name = "pin-project-lite" version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" @@ -1327,7 +1455,7 @@ version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" dependencies = [ - "winapi", + "winapi 0.3.9", ] [[package]] @@ -1337,7 +1465,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "431949c384f4e2ae07605ccaa56d1d9d2ecdb5cadd4f9577ccfab29f2e5149fc" dependencies = [ "base64", - "bytes", + "bytes 1.2.1", "encoding_rs", "futures-core", "futures-util", @@ -1353,11 +1481,11 @@ dependencies = [ "native-tls", "once_cell", "percent-encoding", - "pin-project-lite", + "pin-project-lite 0.2.9", "serde", "serde_json", "serde_urlencoded", - "tokio", + "tokio 1.21.2", "tokio-native-tls", "tower-service", "url", @@ -1522,7 +1650,7 @@ version = "0.10.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f04293dc80c3993519f2d7f6f511707ee7094fe0c6d3406feb330cdb3540eba3" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "cpufeatures", "digest", ] @@ -1564,7 +1692,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "02e2d2db9033d13a1567121ddd7a095ee144db4e1ca1b1bda3419bc0da294ebd" dependencies = [ "libc", - "winapi", + "winapi 0.3.9", ] [[package]] @@ -1616,12 +1744,12 @@ version = "3.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5cdb1ef4eaeeaddc8fbd371e5017057064af0911902ef36b39801f67cc6d79e4" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "fastrand", "libc", "redox_syscall", "remove_dir_all", - "winapi", + "winapi 0.3.9", ] [[package]] @@ -1676,21 +1804,56 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" [[package]] name = "tokio" +version = "0.2.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6703a273949a90131b290be1fe7b039d0fc884aa1935860dfcbe056f28cd8092" +dependencies = [ + "bytes 0.5.6", + "fnv", + "futures-core", + "iovec", + "lazy_static", + "libc", + "memchr", + "mio 0.6.23", + "mio-named-pipes", + "mio-uds", + "num_cpus", + "pin-project-lite 0.1.12", + "signal-hook-registry", + "slab", + "tokio-macros", + "winapi 0.3.9", +] + +[[package]] +name = "tokio" version = "1.21.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a9e03c497dc955702ba729190dc4aac6f2a0ce97f913e5b1b5912fc5039d9099" dependencies = [ "autocfg", - "bytes", + "bytes 1.2.1", "libc", "memchr", - "mio", + "mio 0.8.4", "num_cpus", "parking_lot", - "pin-project-lite", + "pin-project-lite 0.2.9", "signal-hook-registry", "socket2", - "winapi", + "winapi 0.3.9", +] + +[[package]] +name = "tokio-macros" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e44da00bfc73a25f814cd8d7e57a68a5c31b74b3152a0a1d1f590c97ed06265a" +dependencies = [ + "proc-macro2", + "quote", + "syn", ] [[package]] @@ -1700,7 +1863,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f7d995660bd2b7f8c1568414c1126076c13fbb725c40112dc0120b78eb9b717b" dependencies = [ "native-tls", - "tokio", + "tokio 1.21.2", ] [[package]] @@ -1709,11 +1872,11 @@ version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0bb2e075f03b3d66d8d8785356224ba688d2906a371015e225beeb65ca92c740" dependencies = [ - "bytes", + "bytes 1.2.1", "futures-core", "futures-sink", - "pin-project-lite", - "tokio", + "pin-project-lite 0.2.9", + "tokio 1.21.2", "tracing", ] @@ -1729,9 +1892,9 @@ version = "0.1.37" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "log", - "pin-project-lite", + "pin-project-lite 0.2.9", "tracing-core", ] @@ -1840,7 +2003,7 @@ version = "0.2.83" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eaf9f5aceeec8be17c128b2e93e031fb8a4d469bb9c4ae2d7dc1888b26887268" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "wasm-bindgen-macro", ] @@ -1865,7 +2028,7 @@ version = "0.4.33" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "23639446165ca5a5de86ae1d8896b737ae80319560fbaa4c2887b7da6e7ebd7d" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "js-sys", "wasm-bindgen", "web-sys", @@ -1912,6 +2075,12 @@ dependencies = [ [[package]] name = "winapi" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a" + +[[package]] +name = "winapi" version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" @@ -1921,6 +2090,12 @@ dependencies = [ ] [[package]] +name = "winapi-build" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc" + +[[package]] name = "winapi-i686-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -2038,7 +2213,17 @@ version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "80d0f4e272c85def139476380b12f9ac60926689dd2e01d4923222f40580869d" dependencies = [ - "winapi", + "winapi 0.3.9", +] + +[[package]] +name = "ws2_32-sys" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d59cefebd0c892fa2dd6de581e937301d8552cb44489cdff035c6187cb63fa5e" +dependencies = [ + "winapi 0.2.8", + "winapi-build", ] [[package]] diff --git a/crawler/Cargo.toml b/crawler/Cargo.toml index 486729a..cd828ad 100644 --- a/crawler/Cargo.toml +++ b/crawler/Cargo.toml @@ -11,6 +11,7 @@ reqwest = {version = "0.11", features = ["blocking", "json"]} scraper = "0.12.0" itertools = "0.10.5" serde = { version = "1.0", features = ["derive"] } +tokio = { version = "0.2.22", features = ["full"] } [[bin]] name = "crawler" diff --git a/crawler/src/main.rs b/crawler/src/main.rs index c086a76..fdb6623 100644 --- a/crawler/src/main.rs +++ b/crawler/src/main.rs @@ -2,7 +2,8 @@ use itertools::Itertools; use reqwest::blocking::{Client, Response}; use serde::Serialize; -fn main() { +#[tokio::main] +async fn main() { println!("Hello, world! Im the crawler!"); let root_urls = include_str!("../top-1000-websites.txt"); @@ -10,55 +11,60 @@ fn main() { let http_client = reqwest::blocking::Client::new(); - crawler(&http_client, root_urls); + crawler(http_client, root_urls).await; } -//takes list of strings - multithread here? -fn crawler(http_client: &Client, root_urls: Vec<&str>) { +//TODO: crawling depth? - async http client +async fn crawler(http_client: Client, root_urls: Vec<&str>) { println!("Starting to crawl!"); //add root urls to queue - TODO: max q size let crawling_queue: blockingqueue::BlockingQueue<String> = blockingqueue::BlockingQueue::new(); - for url in root_urls { - crawling_queue.push(String::from(url)); - } + root_urls + .into_iter() + .for_each(|u| crawling_queue.push(String::from(u))); //and start crawling - //FIXME: Async! loop { - //blocks - let url = crawling_queue.pop(); - - let crawl_res = crawl_url(http_client, url.as_str()); - if crawl_res.is_err() { - println!("Error crawling {}", url); - continue; - } - - let (content, crawled_urls) = crawl_res.unwrap(); - - //println!("Content: {:?}", content); - println!("Next urls: {:?}", crawled_urls); - - //push content to index - let indexer_res = push_crawl_entry_to_indexer( - http_client, - String::from("http://127.0.0.1:4444/resource"), - url, - content, - ) - .unwrap() - .text(); - - println!("Pushed to indexer {:?}", &indexer_res); - - for url in crawled_urls { - crawling_queue.push(url); - } + //even if we clone, the underlying queue implementation is still shared + let crawling_queue = crawling_queue.clone(); + let http_client = http_client.clone(); + tokio::spawn(async move { + //blocks + let url = crawling_queue.pop(); + + let crawl_res = crawl_url(&http_client, url.as_str()).await; + if crawl_res.is_err() { + println!("Error crawling {}", url); + return; + } + + let (content, crawled_urls) = crawl_res.unwrap(); + + //println!("Content: {:?}", content); + println!("Next urls: {:?}", crawled_urls); + + //push content to index + let indexer_res = push_crawl_entry_to_indexer( + &http_client, + String::from("http://127.0.0.1:4444/resource"), + url, + content, + ) + .await + .unwrap() + .text(); + + println!("Pushed to indexer {:?}", &indexer_res); + + crawled_urls + .into_iter() + .for_each(|u| crawling_queue.push(u)); + }); } } -fn crawl_url(http_client: &Client, url: &str) -> Result<(String, Vec<String>), String> { +async fn crawl_url(http_client: &Client, url: &str) -> Result<(String, Vec<String>), String> { let url = "https://".to_owned() + url; println!("Crawling {:?}", url); @@ -103,7 +109,7 @@ fn crawl_url(http_client: &Client, url: &str) -> Result<(String, Vec<String>), S Ok((response_text, next_urls)) } -fn push_crawl_entry_to_indexer( +async fn push_crawl_entry_to_indexer( http_client: &Client, indexer_url: String, url: String, |