// crawler.wren

import "requests" for Requests, Response

class Crawler {
  construct new(baseUrl) {
    if (!baseUrl.endsWith("/")) {
      baseUrl = baseUrl + "/"
    }
    _baseUrl = baseUrl
    _toVisit = [baseUrl]
    _visited = {}
    _inFlight = 0
    _startTime = System.clock
  }

  run() {
    System.print("Starting crawler on base URL: %(_baseUrl)")
    crawlNext_() // Start the first batch of requests.

    // The main event loop for the crawler. Keep yielding to the C host
    // as long as there is work to do. This keeps the fiber alive.
    while (_inFlight > 0 || _toVisit.count > 0) {
      Fiber.yield()
    }

    // Once the loop finishes, all crawling is done.
    var duration = System.clock - _startTime
    System.print("Crawling finished in %(duration) seconds.")
    System.print("%(_visited.count) pages crawled.")
    Host.signalDone() // Signal the C host to exit.
  }

  crawlNext_() {
    // Throttle requests to be a good web citizen.
    var maxInFlight = 4
    while (_toVisit.count > 0 && _inFlight < maxInFlight) {
      var url = _toVisit.removeAt(0)
      if (_visited.containsKey(url)) { continue }
      _visited[url] = true
      _inFlight = _inFlight + 1
      System.print("Crawling: %(url) (In flight: %(_inFlight))")
      Requests.get(url, null, Fn.new {|err, res|
        handleResponse_(err, res, url)
      })
    }
  }

  handleResponse_(err, res, url) {
    _inFlight = _inFlight - 1
    System.print("Finished: %(url) (In flight: %(_inFlight))")

    if (err != null) {
      System.print("Error crawling %(url): %(err)")
      crawlNext_() // A slot opened up, try to crawl more.
      return
    }

    if (res.statusCode >= 400) {
      System.print("Failed to crawl %(url) - Status: %(res.statusCode)")
      crawlNext_() // A slot opened up, try to crawl more.
      return
    }

    // The response body is already a string, no need to call toString().
    var body = res.body
    findLinks_(body, url)
    crawlNext_() // A slot opened up, try to crawl more.
  }

  findLinks_(html, pageUrl) {
    // A simple but effective way to find href attributes: split the page on
    // `href="` and read each chunk up to the next quote. The first chunk is
    // the text before the first href attribute, so it is skipped; splitting
    // on newlines instead would produce junk "links" from unrelated quoted
    // attributes.
    var chunks = html.split("href=\"")
    for (i in 1...chunks.count) {
      var quote = chunks[i].indexOf("\"")
      if (quote > 0) {
        addUrl_(chunks[i][0...quote], pageUrl)
      }
    }
  }

  addUrl_(link, pageUrl) {
    // Ignore mailto, anchors, and other non-HTTP schemes.
    if (link.startsWith("mailto:") || link.startsWith("#") || link.startsWith("javascript:")) return

    var newUrl = ""
    if (link.startsWith("http://") || link.startsWith("https://")) {
      newUrl = link
    } else if (link.startsWith("/")) {
      // Handle absolute paths.
      var uri = parseUri_(_baseUrl)
      newUrl = "%(uri["scheme"])://%(uri["host"])%(link)"
    } else {
      // Handle relative paths: resolve against the current page's directory.
      var lastSlash = lastIndexOf_(pageUrl, "/")
      var base = pageUrl[0..lastSlash] // Includes the trailing slash.
      newUrl = base + link
    }

    // Normalize the URL to handle ".." and ".".
    newUrl = normalizeUrl_(newUrl)

    // Only crawl URLs that are within the base URL's scope and haven't been seen.
    if (newUrl.startsWith(_baseUrl) && !_visited.containsKey(newUrl) && !_toVisit.contains(newUrl)) {
      _toVisit.add(newUrl)
    }
  }

  // Wren's String has indexOf but no lastIndexOf, so scan forward and keep
  // the last match.
  lastIndexOf_(s, needle) {
    var last = -1
    var i = s.indexOf(needle)
    while (i != -1) {
      last = i
      i = s.indexOf(needle, i + 1)
    }
    return last
  }

  parseUri_(url) {
    var parts = {}
    var schemeEnd = url.indexOf("://")
    parts["scheme"] = url[0...schemeEnd]
    var hostStart = schemeEnd + 3
    var hostEnd = url.indexOf("/", hostStart)
    if (hostEnd == -1) hostEnd = url.count
    parts["host"] = url[hostStart...hostEnd]
    return parts
  }

  normalizeUrl_(url) {
    var parts = url.split("/")
    var stack = []
    for (part in parts) {
      if (part == "" || part == ".") continue
      if (part == "..") {
        if (stack.count > 0) stack.removeAt(-1)
      } else {
        stack.add(part)
      }
    }
    var result = stack.join("/")
    // Dropping empty parts also collapsed the "//" after the scheme; this is
    // a bit of a hack to restore it.
    if (url.startsWith("http")) {
      return result.replace(":/", "://")
    }
    return result
  }
}
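// ---------------------------------------------------------------------------
// The "requests" module imported above is provided by the C host, so its
// source is not part of this file. Based purely on how it is called from
// Crawler, its Wren-facing interface is assumed to look roughly like the
// mock below, which can stand in for the real module when exercising the
// link-parsing logic without the C host. MockRequests and MockResponse are
// illustrative names, not part of the host API.

class MockResponse {
  construct new(statusCode, body) {
    _statusCode = statusCode
    _body = body
  }
  statusCode { _statusCode }
  body { _body }
}

class MockRequests {
  // Mirrors the call shape used above: get(url, headers, callback), where
  // the callback is invoked with (err, res). The mock answers synchronously.
  static get(url, headers, callback) {
    callback.call(null, MockResponse.new(200, "<a href=\"/example\">example</a>"))
  }
}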
// This class is used to signal the C host application to terminate.
// The C code will look for this static method and bind it as a foreign method.
class Host {
  foreign static signalDone()
}

// The main entry point for our script. Note that the script never calls this
// fiber itself: the C host is expected to start it, and the Fiber.yield()
// calls in Crawler.run() hand control back to the host until it resumes the
// fiber again.
var mainFiber = Fiber.new {
  var crawler = Crawler.new("https://molodetz.nl")
  crawler.run()
}
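// ---------------------------------------------------------------------------
// Host lifecycle sketch (an assumption based on the comments above, since the
// C side is not part of this file):
//
//   1. The host interprets this script and looks up the `mainFiber` variable.
//   2. It calls the fiber once, which starts Crawler.run().
//   3. Each Fiber.yield() returns control to the host's event loop; when an
//      HTTP request completes, the host invokes the Wren callback passed to
//      Requests.get, then resumes mainFiber so run() can re-check its loop.
//   4. Host.signalDone() tells the host the crawl is finished and it can
//      tear down the VM and exit.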