Crawler.
parent e01148ba37
commit da5d5cc3f4

crawler.wren | 176 lines (new file)
@@ -0,0 +1,176 @@
// crawler.wren

import "requests" for Requests, Response
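
// Assumed host-provided API (inferred from the calls below; not core Wren):
// Requests.get(url, headers, callback) starts an async HTTP GET and later
// invokes callback(err, res), where res exposes .statusCode and .body.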

class Crawler {
  construct new(baseUrl) {
    // Normalize the base URL so the prefix checks below behave consistently.
    if (!baseUrl.endsWith("/")) {
      baseUrl = baseUrl + "/"
    }
    _baseUrl = baseUrl
    _toVisit = [baseUrl]  // FIFO queue of URLs waiting to be fetched.
    _visited = {}         // Map used as a set of URLs already requested.
    _inFlight = 0         // Requests currently awaiting a response.
    _startTime = System.clock
  }

  run() {
    System.print("Starting crawler on base URL: %(_baseUrl)")
    crawlNext_() // Start the first batch of requests.

    // The main event loop for the crawler. Keep yielding to the C host
    // as long as there is work to do. This keeps the fiber alive.
    while (_inFlight > 0 || _toVisit.count > 0) {
      Fiber.yield()
    }

    // Once the loop finishes, all crawling is done.
    var duration = System.clock - _startTime
    System.print("Crawling finished in %(duration) seconds.")
    System.print("%(_visited.count) pages crawled.")
    Host.signalDone() // Signal the C host to exit.
  }

  crawlNext_() {
    // Throttle requests to be a good web citizen.
    var maxInFlight = 10
    while (_toVisit.count > 0 && _inFlight < maxInFlight) {
      // Take from the front, append at the back: breadth-first order.
      var url = _toVisit.removeAt(0)
      if (_visited.containsKey(url)) {
        continue
      }

      _visited[url] = true
      _inFlight = _inFlight + 1
      System.print("Crawling: %(url) (In flight: %(_inFlight))")

      Requests.get(url, null, Fn.new {|err, res|
        handleResponse_(err, res, url)
      })
    }
  }
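
  // maxInFlight = 10 is a politeness cap, not a hard requirement: every
  // completed request calls back into crawlNext_(), so the queue drains
  // at most ten requests at a time.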

  handleResponse_(err, res, url) {
    _inFlight = _inFlight - 1
    System.print("Finished: %(url) (In flight: %(_inFlight))")

    if (err != null) {
      System.print("Error crawling %(url): %(err)")
      crawlNext_() // A slot opened up, try to crawl more.
      return
    }

    if (res.statusCode >= 400) {
      System.print("Failed to crawl %(url) - Status: %(res.statusCode)")
      crawlNext_() // A slot opened up, try to crawl more.
      return
    }

    findLinks_(res.body, url)
    crawlNext_() // A slot opened up, try to crawl more.
  }

  findLinks_(html, pageUrl) {
    // Use the native split method: everything after each occurrence of
    // href=" starts with a link, terminated by the next double quote.
    var chunks = html.split("href=\"")
    for (i in 1...chunks.count) {
      var parts = chunks[i].split("\"")
      if (parts.count > 1) {
        addUrl_(parts[0], pageUrl)
      }
    }
  }

  addUrl_(link, pageUrl) {
    // Ignore mailto, anchors, and other schemes.
    if (link.startsWith("mailto:") || link.startsWith("#") || link.startsWith("javascript:")) return

    var newUrl = ""
    if (link.startsWith("http://") || link.startsWith("https://")) {
      newUrl = link
    } else if (link.startsWith("/")) {
      // Handle absolute paths.
      var uri = parseUri_(_baseUrl)
      newUrl = "%(uri["scheme"])://%(uri["host"])%(link)"
    } else {
      // Handle relative paths.
      var lastSlash = lastIndexOf_(pageUrl, "/")
      var base = pageUrl[0..lastSlash]
      newUrl = "%(base)%(link)"
    }

    // Normalize the URL to handle ".." and ".".
    newUrl = normalizeUrl_(newUrl)

    // Only crawl URLs that are within the base URL's scope and haven't been seen.
    if (newUrl.startsWith(_baseUrl) && !_visited.containsKey(newUrl) && !_toVisit.contains(newUrl)) {
      _toVisit.add(newUrl)
    }
  }

  parseUri_(url) {
    var parts = {}
    var schemeEnd = url.indexOf("://")
    parts["scheme"] = url[0...schemeEnd]
    var hostStart = schemeEnd + 3
    var hostEnd = url.indexOf("/", hostStart)
    if (hostEnd == -1) hostEnd = url.count
    parts["host"] = url[hostStart...hostEnd]
    return parts
  }
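
  // e.g. parseUri_("https://molodetz.nl/x") -> {"scheme": "https", "host": "molodetz.nl"}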

  normalizeUrl_(url) {
    var parts = url.split("/")
    var stack = []
    for (part in parts) {
      if (part == "" || part == ".") continue
      if (part == "..") {
        if (stack.count > 0) stack.removeAt(-1)
      } else {
        stack.add(part)
      }
    }

    var result = stack.join("/")
    // This is a bit of a hack to fix the double slashes after the scheme.
    if (url.startsWith("http")) {
      return result.replace(":/", "://")
    }
    return result
  }

  // Helper method to find the last index of a substring.
  lastIndexOf_(str, search) {
    var lastFound = -1
    var start = 0
    while (true) {
      // The native indexOf will abort if the start index is out of bounds.
      // Since the VM's implementation is strict, we add a guard here.
      if (start >= str.count) return lastFound

      var index = str.indexOf(search, start)
      if (index == -1) return lastFound

      lastFound = index
      start = index + 1
    }
  }
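
  // e.g. lastIndexOf_("a/b/c", "/") returns 3.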
}

// This class is used to signal the C host application to terminate.
class Host {
  foreign static signalDone()
}
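
// On the C side, signalDone() is presumably bound through Wren's embedding
// API (WrenConfiguration.bindForeignMethodFn) and flips a flag that lets the
// host's event loop exit; the actual binding lives in the C host, not here.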

// The main entry point for our script.
var mainFiber = Fiber.new {
  var crawler = Crawler.new("https://molodetz.nl")
  crawler.run()
}
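
// Nothing runs the fiber here: the C host is expected to start it and keep
// resuming it between I/O events until Host.signalDone() fires. A rough
// pure-Wren equivalent of that driving loop would be:
//
//   mainFiber.call()
//   while (!mainFiber.isDone) mainFiber.call()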