// retoor <retoor@molodetz.nl>
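
// Builds the Wren-CLI manual: renders Jinja page templates, derives SEO
// metadata (keywords, descriptions, Open Graph tags) from the rendered
// HTML, and emits a client-side search index.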
import "io" for File, Directory
|
||
|
|
import "os" for Process
|
||
|
|
import "pathlib" for Path
|
||
|
|
import "yaml" for Yaml
|
||
|
|
import "json" for Json
|
||
|
|
import "jinja" for Environment, FileSystemLoader, ChoiceLoader
|
||
|
|
import "regex" for Regex
|
||
|
|
import "strutil" for Str
|
||
|
|
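
// Reduces rendered HTML to plain text: drops non-content elements
// (scripts, styles, navigation chrome), strips the remaining tags, and
// collapses whitespace. Used for keyword extraction and the search index.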
class TextExtractor {
  static SKIP_TAGS { ["script", "style", "nav", "head", "header", "footer", "aside"] }

  static extract(html) {
    var text = html
    for (tag in TextExtractor.SKIP_TAGS) {
      var pattern = Regex.new("<" + tag + "[^>]*>([^<]|<[^/]|</[^" + tag[0] + "])*</" + tag + ">", "gi")
      text = pattern.replaceAll(text, "")
    }
    var tagPattern = Regex.new("<[^>]+>", "g")
    text = tagPattern.replaceAll(text, " ")
    var whitespace = Regex.new("[ \t\n\r\f]+", "g")
    text = whitespace.replaceAll(text, " ")
    return text.trim()
  }
}
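
// Derives SEO metadata from a page's text: keywords by word frequency,
// a meta description from the leading sentences, and the page title.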
class SEOGenerator {
  static STOP_WORDS {
    return [
      "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
      "of", "with", "by", "from", "as", "is", "was", "are", "were", "been",
      "be", "have", "has", "had", "do", "does", "did", "will", "would",
      "could", "should", "may", "might", "must", "shall", "can", "need",
      "this", "that", "these", "those", "it", "its", "they", "them",
      "we", "us", "you", "your", "he", "she", "him", "her", "i", "my",
      "if", "then", "else", "when", "where", "why", "how", "what", "which",
      "who", "whom", "not", "no", "yes", "all", "any", "both", "each",
      "more", "most", "other", "some", "such", "only", "same", "so",
      "than", "too", "very", "just", "also", "now", "here", "there"
    ]
  }

  static WREN_TERMS {
    return [
      "wren", "fiber", "class", "method", "module", "import", "foreign",
      "static", "construct", "scheduler", "async", "await", "cli", "api",
      "json", "http", "websocket", "sqlite", "crypto", "tls", "regex"
    ]
  }
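
  // Counts word frequencies (ignoring stop words), boosts words that also
  // appear in the page title (x3) or the Wren domain vocabulary (x2), and
  // returns the top maxKeywords words.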
  static extractKeywords(text, title, maxKeywords) {
    var wordPattern = Regex.new("\\b[a-zA-Z][a-zA-Z0-9_]{2,}\\b", "g")
    var matches = wordPattern.matchAll(Str.toLower(text))
    var freq = {}
    var stopWords = SEOGenerator.STOP_WORDS
    for (match in matches) {
      var word = match.text
      if (word.count > 2 && !stopWords.contains(word)) {
        if (freq.containsKey(word)) {
          freq[word] = freq[word] + 1
        } else {
          freq[word] = 1
        }
      }
    }
    var titleMatches = wordPattern.matchAll(Str.toLower(title))
    var titleWords = []
    for (m in titleMatches) titleWords.add(m.text)
    var wrenTerms = SEOGenerator.WREN_TERMS
    for (word in freq.keys) {
      if (titleWords.contains(word)) freq[word] = freq[word] * 3
      if (wrenTerms.contains(word)) freq[word] = freq[word] * 2
    }
    var sorted = freq.keys.toList
    // Wren sort comparators must return a Bool (a Num like freq[b] - freq[a]
    // is always truthy), so compare explicitly for descending frequency.
    sorted.sort {|a, b| freq[a] > freq[b] }
    var result = []
    var count = 0
    for (word in sorted) {
      if (count >= maxKeywords) break
      result.add(word)
      count = count + 1
    }
    return result
  }
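
  // Builds a meta description: strips navigation boilerplate, keeps
  // sentence-like fragments longer than 20 bytes that don't look like code,
  // then truncates at a word boundary or appends further sentences to
  // approach maxLength.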
  static generateDescription(text, title, maxLength) {
    var ws = Regex.new("\\s+", "g")
    text = ws.replaceAll(text, " ").trim()
    var skipPattern = Regex.new("^Skip to main content\\s*", "")
    text = skipPattern.replaceAll(text, "")
    var menuPattern = Regex.new("^Menu\\s+", "")
    text = menuPattern.replaceAll(text, "")
    var versionPattern = Regex.new("Wren-CLI\\s+v[\\d.]+\\s*", "g")
    text = versionPattern.replaceAll(text, "")
    var navPattern = Regex.new("Previous:.*?Next:.*$", "")
    text = navPattern.replaceAll(text, "")
    text = text.trim()
    var sentencePattern = Regex.new("[.!?][ \t\n\r\f]+", "g")
    var sentences = sentencePattern.split(text)
    var filtered = []
    for (s in sentences) {
      var sLen = s.bytes.count
      if (sLen > 20 && !s.startsWith("import ") && !s.startsWith("var ") && !s.startsWith("//")) {
        filtered.add(s)
      }
    }
    if (filtered.isEmpty) return "%(title) - Wren-CLI documentation and reference."
    var description = filtered[0]
    var descLen = description.bytes.count
    if (descLen > maxLength) {
      description = description[0...(maxLength - 3)]
      descLen = description.bytes.count
      var lastSpace = descLen - 1
      while (lastSpace > 0 && description[lastSpace] != " ") lastSpace = lastSpace - 1
      if (lastSpace > 0) description = description[0...lastSpace]
      description = description + "..."
    } else if (descLen < 80 && filtered.count > 1) {
      var i = 1
      var fCount = filtered.count
      while (i < fCount) {
        var addLen = filtered[i].bytes.count
        if (descLen + addLen + 1 <= maxLength) {
          description = description + " " + filtered[i]
          descLen = descLen + addLen + 1
        } else {
          break
        }
        i = i + 1
      }
    }
    return description
  }
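
  // Prefers the first <h1>; falls back to the <title> (trimmed at " - "),
  // then to a generic default.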
  static extractTitle(html) {
    var h1Pattern = Regex.new("<h1[^>]*>([^<]+)</h1>", "i")
    var match = h1Pattern.match(html)
    if (match) return match.group(1).trim()
    var titlePattern = Regex.new("<title[^>]*>([^<]+)</title>", "i")
    match = titlePattern.match(html)
    if (match) {
      var title = match.group(1).trim()
      if (title.contains(" - ")) {
        var parts = title.split(" - ")
        return parts[0]
      }
      return title
    }
    return "Wren-CLI Documentation"
  }
}
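
// Normalizes the Jinja source templates in place: prepends the author
// comment, re-indents top-level tags inside the article block, and records
// every file it touched.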
class TemplateFormatter {
  static INDENT { " " }

  construct new() {
    _fixes = []
  }

  fixes { _fixes }
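
  // Runs every fixer over one template; rewrites the file only if anything
  // changed and returns whether it did.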
  formatFile(path, section) {
    var content = path.readText()
    var original = content
    content = ensureAuthorComment_(content)
    content = fixArticleIndentation_(content)
    content = fixNavigationUrls_(content, section)
    if (content != original) {
      path.writeText(content)
      _fixes.add(path.toString)
      return true
    }
    return false
  }

  ensureAuthorComment_(content) {
    if (!content.startsWith("{# retoor")) {
      return "{# retoor <retoor@molodetz.nl> #}" + "\n" + content
    }
    return content
  }
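
  // Indents unindented top-level tags between {% block article %} and
  // {% endblock %}; lines starting with "<!" (comments, doctypes) are left
  // alone.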
  fixArticleIndentation_(content) {
    var lines = content.split("\n")
    var result = []
    var inArticle = false
    for (line in lines) {
      if (line.contains("{\x25 block article \x25}")) {
        inArticle = true
        result.add(line)
        continue
      }
      if (line.contains("{\x25 endblock \x25}") && inArticle) {
        inArticle = false
        result.add(line)
        continue
      }
      if (inArticle && !line.isEmpty && !line.startsWith(" ")) {
        var stripped = line.trim()
        if (stripped.startsWith("<") && !stripped.startsWith("<!")) {
          line = TemplateFormatter.INDENT + stripped
        }
      }
      result.add(line)
    }
    return result.join("\n")
  }

  fixNavigationUrls_(content, section) {
    // Placeholder: navigation URLs are currently left untouched.
    return content
  }

  report() {
    if (!_fixes.isEmpty) {
      System.print(" Auto-formatted %(_fixes.count) file(s):")
      for (f in _fixes) {
        System.print(" %(f)")
      }
    }
  }
}
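
// Orchestrates the build: loads the site and navigation data, sets up the
// Jinja environment, renders every page (twice; see build()), and copies
// static assets into bin/manual.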
class ManualBuilder {
  construct new() {
    _root = Path.new(Process.cwd)
    _src = _root / "manual_src"
    _output = _root / "bin" / "manual"
    _site = Yaml.parse((_src / "data/site.yaml").readText())
    _nav = Yaml.parse((_src / "data/navigation.yaml").readText())
    var templatesLoader = FileSystemLoader.new((_src / "templates").toString)
    var pagesLoader = FileSystemLoader.new(_src.toString)
    _env = Environment.new(ChoiceLoader.new([templatesLoader, pagesLoader]))
    _searchIndexJson = ""
  }
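
  // Two-pass build: pages are rendered once so their text can seed the
  // search index, then re-rendered with the index JSON embedded.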
  build() {
    System.print("[DEBUG] Starting build...")
    System.print("[DEBUG] Checking output exists...")
    if (_output.exists()) {
      System.print("[DEBUG] Removing old output...")
      _output.rmtree()
    }
    System.print("[DEBUG] Creating output directory...")
    _output.mkdir(true)
    System.print("[DEBUG] Building pages...")
    buildPages()
    System.print("[DEBUG] Building search index...")
    var searchIndex = buildSearchIndex()
    System.print("[DEBUG] Converting search index to JSON...")
    _searchIndexJson = escapeJsonForHtml_(Json.stringify(searchIndex))
    System.print("[DEBUG] Rebuilding pages with index...")
    rebuildPagesWithIndex()
    System.print("[DEBUG] Copying static files...")
    copyStatic()
    System.print("Built manual to %(_output)")
  }

  formatTemplates() {
    System.print("[DEBUG] formatTemplates: skipped (slow in Wren)")
  }

  buildPages() {
    var pagesDir = _src / "pages"
    for (htmlFile in pagesDir.rglob("*.html")) {
      var relPath = htmlFile.relativeTo(pagesDir)
      buildPage(htmlFile, relPath)
    }
  }
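
  // Second pass: identical to buildPages(), but by now _searchIndexJson is
  // populated, so the rendered pages embed the search index.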
  rebuildPagesWithIndex() {
    var pagesDir = _src / "pages"
    for (htmlFile in pagesDir.rglob("*.html")) {
      var relPath = htmlFile.relativeTo(pagesDir)
      buildPage(htmlFile, relPath)
    }
  }
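
  // Renders a page twice: first with empty SEO data to obtain its plain
  // text and title, then again with the derived keywords and description
  // filled in.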
  buildPage(srcPath, relPath) {
    System.print("[DEBUG] buildPage: %(relPath)")
    var templatePath = "pages/" + relPath.toString
    System.print("[DEBUG] Getting template: %(templatePath)")
    var template = _env.getTemplate(templatePath)
    System.print("[DEBUG] Template loaded")
    var depth = relPath.parts.count - 1
    var staticPrefix = depth > 0 ? ("../" * depth) : "./"
    var context = {
      "current_path": relPath.toString,
      "static_prefix": staticPrefix,
      "depth": depth,
      "seo": {},
      "site": _site,
      "nav": _nav,
      "search_index_json": _searchIndexJson
    }
    var html = template.render(context)
    var text = TextExtractor.extract(html)
    var title = SEOGenerator.extractTitle(html)
    var relStr = relPath.toString
    var ogType = (relStr.contains("api/") || relStr.contains("tutorials/")) ? "article" : "website"
    var seo = {
      "keywords": SEOGenerator.extractKeywords(text, title, 10),
      "description": SEOGenerator.generateDescription(text, title, 155),
      "og_title": "%(title) - Wren-CLI",
      "og_type": ogType
    }
    context["seo"] = seo
    html = template.render(context)
    var outPath = _output / relPath
    outPath.parent.mkdir(true)
    outPath.writeText(html)
    System.print(" %(relPath)")
  }

  copyStatic() {
    var staticSrc = _src / "static"
    for (entry in staticSrc.walk()) {
      var root = entry[0]
      var files = entry[2]
      for (f in files) {
        var item = root / f
        var rel = item.relativeTo(staticSrc)
        var dest = _output / rel
        dest.parent.mkdir(true)
        item.copyfile(dest)
      }
    }
  }
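
  // Walks the navigation data, extracts up to 500 words of text from each
  // rendered page, and writes search-index.json alongside the pages.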
  buildSearchIndex() {
    var index = {"pages": []}
    for (section in _nav["sections"]) {
      var sectionTitle = section["title"]
      var sectionDir = section["directory"]
      var pages = section["pages"]
      if (pages == null) pages = []
      for (page in pages) {
        var url = "%(sectionDir)/%(page["file"]).html"
        var renderedPath = _output / url
        var content = ""
        if (renderedPath.exists()) {
          var html = renderedPath.readText()
          content = TextExtractor.extract(html)
          var words = content.split(" ")
          if (words.count > 500) {
            content = words[0...500].join(" ")
          }
        }
        var description = page["description"]
        if (description == null) description = ""
        var methods = page["methods"]
        if (methods == null) methods = []
        index["pages"].add({
          "url": url,
          "title": page["title"],
          "section": sectionTitle,
          "description": description,
          "methods": methods,
          "content": content
        })
      }
    }
    (_output / "search-index.json").writeText(Json.stringify(index, 2))
    return index
  }
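
  // Escapes "</" and "<!--" so the JSON can be inlined in a <script> block
  // without terminating it prematurely.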
  escapeJsonForHtml_(jsonString) {
    return jsonString.replace("</", "<\\/").replace("<!--", "<\\!--")
  }
}

var builder = ManualBuilder.new()
builder.build()