wrenner/crawler.wren at main

 // crawler.wren
 import "requests" for Requests, Response
 class Crawler {
   construct new(baseUrl) {
     if (!baseUrl.endsWith("/")) {
       baseUrl = baseUrl + "/"
     }
     _baseUrl = baseUrl
     _toVisit = [baseUrl]
     _visited = {}
     _inFlight = 0
     _startTime = System.clock
   }
   run() {
     System.print("Starting crawler on base URL: %(_baseUrl)")
     crawlNext_() // Start the first batch of requests
     // The main event loop for the crawler. Keep yielding to the C host
     // as long as there is work to do. This keeps the fiber alive.
     while (_inFlight > 0 || _toVisit.count > 0) {
       Fiber.yield()
     }
     // Once the loop finishes, all crawling is done.
     var duration = System.clock - _startTime
     System.print("Crawling finished in %(duration.toString) seconds.")
     System.print("%(_visited.count) pages crawled.")
     Host.signalDone() // Signal the C host to exit
   }
   crawlNext_() {
     // Throttle requests to be a good web citizen.
     var maxInFlight = 10
     while (_toVisit.count > 0 && _inFlight < maxInFlight) {
       var url = _toVisit.removeAt(0)
       if (_visited.containsKey(url)) {
         continue
       }
       _visited[url] = true
       _inFlight = _inFlight + 1
       System.print("Crawling: %(url) (In flight: %(_inFlight))")
       Requests.get(url, null, Fn.new {|err, res|
         handleResponse_(err, res, url)
       })
     }
   }
   handleResponse_(err, res, url) {
     _inFlight = _inFlight - 1
     System.print("Finished: %(url) (In flight: %(_inFlight))")
     if (err != null) {
       System.print("Error crawling %(url): %(err)")
       crawlNext_() // A slot opened up, try to crawl more.
       return
     }
     if (res.statusCode >= 400) {
       System.print("Failed to crawl %(url) - Status: %(res.statusCode)")
       crawlNext_() // A slot opened up, try to crawl more.
       return
     }
     var body = res.body
     findLinks_(body, url)
     crawlNext_() // A slot opened up, try to crawl more.
   }
   findLinks_(html, pageUrl) {
     // Use the native replace and split methods.
     var links = html.replace("href=\"", "href=\"\n").split("\n")
     for (line in links) {
         if (line.contains("\"")) {
             var parts = line.split("\"")
             if (parts.count > 1) {
                 var link = parts[0]
                 addUrl_(link, pageUrl)
             }
         }
     }
   }
   addUrl_(link, pageUrl) {
     // Ignore mailto, anchors, and other schemes
     if (link.startsWith("mailto:") || link.startsWith("#") || link.startsWith("javascript:")) return
     var newUrl = ""
     if (link.startsWith("http://") || link.startsWith("https://")) {
       newUrl = link
     } else if (link.startsWith("/")) {
       // Handle absolute paths
       var uri = parseUri_(_baseUrl)
       newUrl = "%(uri["scheme"])://%(uri["host"])%(link)"
     } else {
       // Handle relative paths
       var lastSlash = lastIndexOf_(pageUrl, "/")
       var base = pageUrl[0..lastSlash]
       newUrl = "%(base)%(link)"
     }
     // Normalize URL to handle ".." and "."
     newUrl = normalizeUrl_(newUrl)
     // Only crawl URLs that are within the base URL's scope and haven't been seen.
     if (newUrl.startsWith(_baseUrl) && !_visited.containsKey(newUrl) && !_toVisit.contains(newUrl)) {
       _toVisit.add(newUrl)
     }
   }
   parseUri_(url) {
     var parts = {}
     var schemeEnd = url.indexOf("://")
     parts["scheme"] = url[0...schemeEnd]
     var hostStart = schemeEnd + 3
     var hostEnd = url.indexOf("/", hostStart)
     if (hostEnd == -1) hostEnd = url.count
     parts["host"] = url[hostStart...hostEnd]
     return parts
   }
   normalizeUrl_(url) {
     var parts = url.split("/")
     var stack = []
     for (part in parts) {
       if (part == "" || part == ".") continue
       if (part == "..") {
         if (stack.count > 0) stack.removeAt(-1)
       } else {
         stack.add(part)
       }
     }
     var result = stack.join("/")
     // This is a bit of a hack to fix the double slashes after the scheme
     if (url.startsWith("http")) {
         return result.replace(":/", "://")
     }
     return result
   }
   // Helper method to find the last index of a substring.
   lastIndexOf_(str, search) {
     var index = -1
     var lastFound = -1
     var start = 0
     while (true) {
       // The native indexOf will abort if the start index is out of bounds.
       // Since the VM's implementation is strict, we add a guard here.
       if (start >= str.count) return lastFound
       index = str.indexOf(search, start)
       if (index == -1) return lastFound
       lastFound = index
       start = index + 1
     }
     return lastFound
   }
 }
 // This class is used to signal the C host application to terminate.
 class Host {
   foreign static signalDone()
 }
 // The main entry point for our script.
 var mainFiber = Fiber.new {
   var crawler = Crawler.new("https://molodetz.nl")
   crawler.run()
 }