// crawler.wren

import "requests" for Requests, Response

class Crawler {
  construct new(baseUrl) {
    if (!baseUrl.endsWith("/")) {
      baseUrl = baseUrl + "/"
    }
    _baseUrl = baseUrl
    _toVisit = [baseUrl]
    _visited = {}
    _inFlight = 0
    _startTime = System.clock
  }

  run() {
    System.print("Starting crawler on base URL: %(_baseUrl)")
    crawlNext_() // Start the first batch of requests.

    // The main event loop for the crawler. Keep yielding to the C host
    // as long as there is work to do. This keeps the fiber alive.
    while (_inFlight > 0 || _toVisit.count > 0) {
      Fiber.yield()
    }

    // Once the loop finishes, all crawling is done.
    var duration = System.clock - _startTime
    System.print("Crawling finished in %(duration) seconds.")
    System.print("%(_visited.count) pages crawled.")
    Host.signalDone() // Signal the C host to exit.
  }

  crawlNext_() {
    // Throttle requests to be a good web citizen.
    var maxInFlight = 4
    while (_toVisit.count > 0 && _inFlight < maxInFlight) {
      var url = _toVisit.removeAt(0)
      if (_visited.containsKey(url)) { continue }
      _visited[url] = true
      _inFlight = _inFlight + 1
      System.print("Crawling: %(url) (In flight: %(_inFlight))")
      Requests.get(url, null, Fn.new {|err, res|
        handleResponse_(err, res, url)
      })
    }
  }

  handleResponse_(err, res, url) {
    _inFlight = _inFlight - 1
    System.print("Finished: %(url) (In flight: %(_inFlight))")

    if (err != null) {
      System.print("Error crawling %(url): %(err)")
      crawlNext_() // A slot opened up, try to crawl more.
      return
    }

    if (res.statusCode >= 400) {
      System.print("Failed to crawl %(url) - Status: %(res.statusCode)")
      crawlNext_() // A slot opened up, try to crawl more.
      return
    }

    // The response body is already a string, no need to call toString().
    var body = res.body
    findLinks_(body, url)
    crawlNext_() // A slot opened up, try to crawl more.
  }

  findLinks_(html, pageUrl) {
    // A simple but effective way to find href attributes: split the page on
    // `href="` and read each chunk up to the next quote. The first chunk is
    // the text before the first href attribute, so it is skipped; splitting
    // on newlines instead would produce junk "links" from unrelated quoted
    // attributes.
    var chunks = html.split("href=\"")
    for (i in 1...chunks.count) {
      var quote = chunks[i].indexOf("\"")
      if (quote > 0) {
        addUrl_(chunks[i][0...quote], pageUrl)
      }
    }
  }

  addUrl_(link, pageUrl) {
    // Ignore mailto, anchors, and other non-HTTP schemes.
    if (link.startsWith("mailto:") || link.startsWith("#") || link.startsWith("javascript:")) return

    var newUrl = ""
    if (link.startsWith("http://") || link.startsWith("https://")) {
      newUrl = link
    } else if (link.startsWith("/")) {
      // Handle absolute paths.
      var uri = parseUri_(_baseUrl)
      newUrl = "%(uri["scheme"])://%(uri["host"])%(link)"
    } else {
      // Handle relative paths: resolve against the current page's directory.
      var lastSlash = lastIndexOf_(pageUrl, "/")
      var base = pageUrl[0..lastSlash] // Includes the trailing slash.
      newUrl = base + link
    }

    // Normalize the URL to handle ".." and ".".
    newUrl = normalizeUrl_(newUrl)

    // Only crawl URLs that are within the base URL's scope and haven't been seen.
    if (newUrl.startsWith(_baseUrl) && !_visited.containsKey(newUrl) && !_toVisit.contains(newUrl)) {
      _toVisit.add(newUrl)
    }
  }

  // Wren's String has indexOf but no lastIndexOf, so scan forward and keep
  // the last match.
  lastIndexOf_(s, needle) {
    var last = -1
    var i = s.indexOf(needle)
    while (i != -1) {
      last = i
      i = s.indexOf(needle, i + 1)
    }
    return last
  }

  parseUri_(url) {
    var parts = {}
    var schemeEnd = url.indexOf("://")
    parts["scheme"] = url[0...schemeEnd]
    var hostStart = schemeEnd + 3
    var hostEnd = url.indexOf("/", hostStart)
    if (hostEnd == -1) hostEnd = url.count
    parts["host"] = url[hostStart...hostEnd]
    return parts
  }

  normalizeUrl_(url) {
    var parts = url.split("/")
    var stack = []
    for (part in parts) {
      if (part == "" || part == ".") continue
      if (part == "..") {
        if (stack.count > 0) stack.removeAt(-1)
      } else {
        stack.add(part)
      }
    }
    var result = stack.join("/")
    // Dropping empty parts also collapsed the "//" after the scheme; this is
    // a bit of a hack to restore it.
    if (url.startsWith("http")) {
      return result.replace(":/", "://")
    }
    return result
  }
}
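// ---------------------------------------------------------------------------
// The "requests" module imported above is provided by the C host, so its
// source is not part of this file. Based purely on how it is called from
// Crawler, its Wren-facing interface is assumed to look roughly like the
// mock below, which can stand in for the real module when exercising the
// link-parsing logic without the C host. MockRequests and MockResponse are
// illustrative names, not part of the host API.

class MockResponse {
  construct new(statusCode, body) {
    _statusCode = statusCode
    _body = body
  }
  statusCode { _statusCode }
  body { _body }
}

class MockRequests {
  // Mirrors the call shape used above: get(url, headers, callback), where
  // the callback is invoked with (err, res). The mock answers synchronously.
  static get(url, headers, callback) {
    callback.call(null, MockResponse.new(200, "<a href=\"/example\">example</a>"))
  }
}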
// This class is used to signal the C host application to terminate.
// The C code will look for this static method and bind it as a foreign method.
class Host {
  foreign static signalDone()
}

// The main entry point for our script. Note that the script never calls this
// fiber itself: the C host is expected to start it, and the Fiber.yield()
// calls in Crawler.run() hand control back to the host until it resumes the
// fiber again.
var mainFiber = Fiber.new {
  var crawler = Crawler.new("https://molodetz.nl")
  crawler.run()
}
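// ---------------------------------------------------------------------------
// Host lifecycle sketch (an assumption based on the comments above, since the
// C side is not part of this file):
//
//   1. The host interprets this script and looks up the `mainFiber` variable.
//   2. It calls the fiber once, which starts Crawler.run().
//   3. Each Fiber.yield() returns control to the host's event loop; when an
//      HTTP request completes, the host invokes the Wren callback passed to
//      Requests.get, then resumes mainFiber so run() can re-check its loop.
//   4. Host.signalDone() tells the host the crawl is finished and it can
//      tear down the VM and exit.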