Crawler.
parent e01148ba37
commit da5d5cc3f4

crawler.wren | 176 lines (new file)
@@ -0,0 +1,176 @@
// crawler.wren

import "requests" for Requests, Response
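
// Assumed host-provided API (inferred from the calls below; not core Wren):
// Requests.get(url, headers, callback) starts an async HTTP GET and later
// invokes callback(err, res), where res exposes .statusCode and .body.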

class Crawler {
  construct new(baseUrl) {
    // Normalize the base URL so the prefix checks below behave consistently.
    if (!baseUrl.endsWith("/")) {
      baseUrl = baseUrl + "/"
    }
    _baseUrl = baseUrl
    _toVisit = [baseUrl]  // FIFO queue of URLs waiting to be fetched.
    _visited = {}         // Map used as a set of URLs already requested.
    _inFlight = 0         // Requests currently awaiting a response.
    _startTime = System.clock
  }

  run() {
    System.print("Starting crawler on base URL: %(_baseUrl)")
    crawlNext_() // Start the first batch of requests.

    // The main event loop for the crawler. Keep yielding to the C host
    // as long as there is work to do. This keeps the fiber alive.
    while (_inFlight > 0 || _toVisit.count > 0) {
      Fiber.yield()
    }

    // Once the loop finishes, all crawling is done.
    var duration = System.clock - _startTime
    System.print("Crawling finished in %(duration) seconds.")
    System.print("%(_visited.count) pages crawled.")
    Host.signalDone() // Signal the C host to exit.
  }

  crawlNext_() {
    // Throttle requests to be a good web citizen.
    var maxInFlight = 10
    while (_toVisit.count > 0 && _inFlight < maxInFlight) {
      // Take from the front, append at the back: breadth-first order.
      var url = _toVisit.removeAt(0)
      if (_visited.containsKey(url)) {
        continue
      }

      _visited[url] = true
      _inFlight = _inFlight + 1
      System.print("Crawling: %(url) (In flight: %(_inFlight))")

      Requests.get(url, null, Fn.new {|err, res|
        handleResponse_(err, res, url)
      })
    }
  }
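
  // maxInFlight = 10 is a politeness cap, not a hard requirement: every
  // completed request calls back into crawlNext_(), so the queue drains
  // at most ten requests at a time.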

  handleResponse_(err, res, url) {
    _inFlight = _inFlight - 1
    System.print("Finished: %(url) (In flight: %(_inFlight))")

    if (err != null) {
      System.print("Error crawling %(url): %(err)")
      crawlNext_() // A slot opened up, try to crawl more.
      return
    }

    if (res.statusCode >= 400) {
      System.print("Failed to crawl %(url) - Status: %(res.statusCode)")
      crawlNext_() // A slot opened up, try to crawl more.
      return
    }

    findLinks_(res.body, url)
    crawlNext_() // A slot opened up, try to crawl more.
  }

  findLinks_(html, pageUrl) {
    // Use the native split method: everything after each occurrence of
    // href=" starts with a link, terminated by the next double quote.
    var chunks = html.split("href=\"")
    for (i in 1...chunks.count) {
      var parts = chunks[i].split("\"")
      if (parts.count > 1) {
        addUrl_(parts[0], pageUrl)
      }
    }
  }

  addUrl_(link, pageUrl) {
    // Ignore mailto, anchors, and other schemes.
    if (link.startsWith("mailto:") || link.startsWith("#") || link.startsWith("javascript:")) return

    var newUrl = ""
    if (link.startsWith("http://") || link.startsWith("https://")) {
      newUrl = link
    } else if (link.startsWith("/")) {
      // Handle absolute paths.
      var uri = parseUri_(_baseUrl)
      newUrl = "%(uri["scheme"])://%(uri["host"])%(link)"
    } else {
      // Handle relative paths.
      var lastSlash = lastIndexOf_(pageUrl, "/")
      var base = pageUrl[0..lastSlash]
      newUrl = "%(base)%(link)"
    }

    // Normalize the URL to handle ".." and ".".
    newUrl = normalizeUrl_(newUrl)

    // Only crawl URLs that are within the base URL's scope and haven't been seen.
    if (newUrl.startsWith(_baseUrl) && !_visited.containsKey(newUrl) && !_toVisit.contains(newUrl)) {
      _toVisit.add(newUrl)
    }
  }

  parseUri_(url) {
    var parts = {}
    var schemeEnd = url.indexOf("://")
    parts["scheme"] = url[0...schemeEnd]
    var hostStart = schemeEnd + 3
    var hostEnd = url.indexOf("/", hostStart)
    if (hostEnd == -1) hostEnd = url.count
    parts["host"] = url[hostStart...hostEnd]
    return parts
  }
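
  // e.g. parseUri_("https://molodetz.nl/x") -> {"scheme": "https", "host": "molodetz.nl"}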

  normalizeUrl_(url) {
    var parts = url.split("/")
    var stack = []
    for (part in parts) {
      if (part == "" || part == ".") continue
      if (part == "..") {
        if (stack.count > 0) stack.removeAt(-1)
      } else {
        stack.add(part)
      }
    }

    var result = stack.join("/")
    // This is a bit of a hack to fix the double slashes after the scheme.
    if (url.startsWith("http")) {
      return result.replace(":/", "://")
    }
    return result
  }

  // Helper method to find the last index of a substring.
  lastIndexOf_(str, search) {
    var lastFound = -1
    var start = 0
    while (true) {
      // The native indexOf will abort if the start index is out of bounds.
      // Since the VM's implementation is strict, we add a guard here.
      if (start >= str.count) return lastFound

      var index = str.indexOf(search, start)
      if (index == -1) return lastFound

      lastFound = index
      start = index + 1
    }
  }
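
  // e.g. lastIndexOf_("a/b/c", "/") returns 3.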
}

// This class is used to signal the C host application to terminate.
class Host {
  foreign static signalDone()
}
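
// On the C side, signalDone() is presumably bound through Wren's embedding
// API (WrenConfiguration.bindForeignMethodFn) and flips a flag that lets the
// host's event loop exit; the actual binding lives in the C host, not here.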

// The main entry point for our script.
var mainFiber = Fiber.new {
  var crawler = Crawler.new("https://molodetz.nl")
  crawler.run()
}
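
// Nothing runs the fiber here: the C host is expected to start it and keep
// resuming it between I/O events until Host.signalDone() fires. A rough
// pure-Wren equivalent of that driving loop would be:
//
//   mainFiber.call()
//   while (!mainFiber.isDone) mainFiber.call()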