// crawler.wren
//
// Breadth-first web crawler restricted to a single base URL. The script
// runs inside a fiber: it yields to the C host whenever requests are in
// flight so the host can pump its event loop, and calls Host.signalDone()
// when crawling completes.
import "requests" for Requests, Response

class Crawler {
  // baseUrl: root of the site to crawl. A trailing slash is enforced so
  // the "stay inside the site" startsWith check below is reliable.
  construct new(baseUrl) {
    if (!baseUrl.endsWith("/")) {
      baseUrl = baseUrl + "/"
    }
    _baseUrl = baseUrl
    _toVisit = [baseUrl]  // FIFO frontier of URLs still to fetch
    _visited = {}         // URLs already fetched (map used as a set)
    _inFlight = 0         // number of requests awaiting a response
    _startTime = System.clock
  }

  // Drives the crawl to completion, reports statistics, and tells the
  // C host it may exit.
  run() {
    System.print("Starting crawler on base URL: %(_baseUrl)")
    crawlNext_() // Start the first batch of requests

    // The main event loop for the crawler. Keep yielding to the C host
    // as long as there is work to do; this keeps the fiber alive.
    // crawlNext_() is also re-invoked here as a guard against a stall
    // where the frontier is non-empty but no request is in flight.
    while (_inFlight > 0 || _toVisit.count > 0) {
      crawlNext_()
      Fiber.yield()
    }

    // Once the loop finishes, all crawling is done.
    var duration = System.clock - _startTime
    System.print("Crawling finished in %(duration) seconds.")
    System.print("%(_visited.count) pages crawled.")
    Host.signalDone() // Signal the C host to exit
  }

  // Starts as many requests as the throttle allows, skipping URLs that
  // were already visited.
  crawlNext_() {
    var maxInFlight = 10 // Throttle requests to be a good web citizen.
    while (_toVisit.count > 0 && _inFlight < maxInFlight) {
      var url = _toVisit.removeAt(0)
      if (_visited.containsKey(url)) continue

      _visited[url] = true
      _inFlight = _inFlight + 1
      System.print("Crawling: %(url) (In flight: %(_inFlight))")

      Requests.get(url, null, Fn.new {|err, res|
        handleResponse_(err, res, url)
      })
    }
  }

  // Callback for a completed request. On success, extracts links from the
  // body; in every case a request slot has been freed, so try to start
  // more requests exactly once on the way out.
  handleResponse_(err, res, url) {
    _inFlight = _inFlight - 1
    System.print("Finished: %(url) (In flight: %(_inFlight))")

    if (err != null) {
      System.print("Error crawling %(url): %(err)")
    } else if (res.statusCode >= 400) {
      System.print("Failed to crawl %(url) - Status: %(res.statusCode)")
    } else {
      findLinks_(res.body, url)
    }

    crawlNext_() // A slot opened up, try to crawl more.
  }

  // Scans the HTML for href="..." attributes and queues every target.
  findLinks_(html, pageUrl) {
    // Insert a newline after each `href="` and split on it: every chunk
    // except the first then *begins* with a link target terminated by
    // its closing quote.
    var chunks = html.replace("href=\"", "href=\"\n").split("\n")

    // Chunk 0 is the text before the first href attribute — it is not a
    // link target. Scanning it (as the original code did) could build a
    // garbage URL out of any unrelated quoted attribute and queue it.
    for (i in 1...chunks.count) {
      var parts = chunks[i].split("\"")
      if (parts.count > 1) {
        addUrl_(parts[0], pageUrl)
      }
    }
  }

  // Resolves `link` (absolute, host-relative, or page-relative) against
  // `pageUrl`, normalizes it, and queues it if it lies inside the site
  // and has not been seen before.
  addUrl_(link, pageUrl) {
    // Ignore mailto, anchors, and other non-crawlable schemes.
    if (link.startsWith("mailto:") || link.startsWith("#") || link.startsWith("javascript:")) return
    if (link == "") return // empty href resolves to the page itself

    var newUrl = ""
    if (link.startsWith("http://") || link.startsWith("https://")) {
      newUrl = link
    } else if (link.startsWith("/")) {
      // Host-relative path: resolve against the base URL's origin.
      var uri = parseUri_(_baseUrl)
      newUrl = "%(uri["scheme"])://%(uri["host"])%(link)"
    } else {
      // Page-relative path: resolve against the page's directory.
      var lastSlash = lastIndexOf_(pageUrl, "/")
      var base = pageUrl[0..lastSlash] // inclusive — keeps the slash
      newUrl = "%(base)%(link)"
    }

    // Normalize URL to handle ".." and "." segments so equivalent URLs
    // compare equal in the dedup checks below.
    newUrl = normalizeUrl_(newUrl)

    // Only crawl URLs within the base URL's scope that we haven't seen.
    if (newUrl.startsWith(_baseUrl) && !_visited.containsKey(newUrl) && !_toVisit.contains(newUrl)) {
      _toVisit.add(newUrl)
    }
  }

  // Extracts the scheme and host from an absolute URL into a map with
  // keys "scheme" and "host".
  parseUri_(url) {
    var parts = {}
    var schemeEnd = url.indexOf("://")
    parts["scheme"] = url[0...schemeEnd]
    var hostStart = schemeEnd + 3
    var hostEnd = url.indexOf("/", hostStart)
    if (hostEnd == -1) hostEnd = url.count
    parts["host"] = url[hostStart...hostEnd]
    return parts
  }

  // Collapses "." and ".." path segments. The scheme://host prefix is
  // split off and re-attached verbatim so that ".." can never climb
  // above the host and the "://" separator survives intact (the previous
  // `replace(":/", "://")` hack also corrupted any later ":/" in the URL).
  normalizeUrl_(url) {
    var prefix = ""
    var path = url
    var schemeEnd = url.indexOf("://")
    if (schemeEnd != -1) {
      var hostStart = schemeEnd + 3
      var hostEnd = url.indexOf("/", hostStart)
      if (hostEnd == -1) return url // origin only — nothing to normalize
      prefix = url[0...hostEnd]
      path = url[hostEnd..-1]
    }

    var stack = []
    for (part in path.split("/")) {
      if (part == "" || part == ".") continue
      if (part == "..") {
        if (stack.count > 0) stack.removeAt(-1)
      } else {
        stack.add(part)
      }
    }
    return prefix + "/" + stack.join("/")
  }

  // Returns the index of the last occurrence of `search` in `str`,
  // or -1 if it never occurs.
  lastIndexOf_(str, search) {
    var lastFound = -1
    var start = 0
    while (start < str.count) {
      // The native indexOf aborts on an out-of-bounds start index, hence
      // the loop guard above.
      var index = str.indexOf(search, start)
      if (index == -1) break
      lastFound = index
      start = index + 1
    }
    return lastFound
  }
}

// Foreign hook used to signal the C host application to terminate.
class Host {
  foreign static signalDone()
}

// The main entry point for our script. The fiber must actually be
// started: the original code constructed it but never called it, so the
// crawler never ran at all.
var mainFiber = Fiber.new {
  var crawler = Crawler.new("https://molodetz.nl")
  crawler.run()
}
mainFiber.call()