|
#!/usr/bin/env python3
|
|
# retoor <retoor@molodetz.nl>
|
|
|
|
import json
import re
import shutil
from collections import Counter
from html.parser import HTMLParser
from pathlib import Path

import yaml
from jinja2 import Environment, FileSystemLoader, ChoiceLoader
|
|
|
|
|
|
class SEOGenerator:
    """Derives SEO metadata (keywords, meta description, title) from the
    rendered text of a Wren-CLI manual page."""

    # Common English words excluded from keyword ranking.
    STOP_WORDS = {
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
        'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'been',
        'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
        'could', 'should', 'may', 'might', 'must', 'shall', 'can', 'need',
        'this', 'that', 'these', 'those', 'it', 'its', 'they', 'them',
        'we', 'us', 'you', 'your', 'he', 'she', 'him', 'her', 'i', 'my',
        'if', 'then', 'else', 'when', 'where', 'why', 'how', 'what', 'which',
        'who', 'whom', 'not', 'no', 'yes', 'all', 'any', 'both', 'each',
        'more', 'most', 'other', 'some', 'such', 'only', 'same', 'so',
        'than', 'too', 'very', 'just', 'also', 'now', 'here', 'there'
    }

    # Domain-specific terms that get a relevance boost in keyword ranking.
    WREN_TERMS = {
        'wren', 'fiber', 'class', 'method', 'module', 'import', 'foreign',
        'static', 'construct', 'scheduler', 'async', 'await', 'cli', 'api',
        'json', 'http', 'websocket', 'sqlite', 'crypto', 'tls', 'regex'
    }

    def extract_keywords(self, text, title, max_keywords=10):
        """Return up to *max_keywords* keywords from *text*, ranked by
        weighted frequency.

        Words that also appear in *title* are weighted 3x; Wren-specific
        terms are weighted 2x; stop words are ignored.  Ties keep
        first-seen order.
        """
        # Identifier-like words of 3+ chars; the regex already enforces the
        # minimum length, so no separate len() check is needed.
        words = re.findall(r'\b[a-zA-Z][a-zA-Z0-9_]{2,}\b', text.lower())
        freq = Counter(w for w in words if w not in self.STOP_WORDS)

        title_words = set(re.findall(r'\b[a-zA-Z][a-zA-Z0-9_]+\b', title.lower()))
        for word in freq:
            if word in title_words:
                freq[word] *= 3
            if word in self.WREN_TERMS:
                freq[word] *= 2

        # most_common() is stable for equal counts, matching the previous
        # sorted(..., reverse=True) behavior.
        return [word for word, _ in freq.most_common(max_keywords)]

    def generate_description(self, text, title, max_length=155):
        """Build a meta description of at most *max_length* chars from the
        first substantive sentence(s) of *text*.

        Falls back to a generic description when no usable sentence exists.
        """
        # Normalize whitespace, then strip site chrome that leaks into the
        # extracted text (skip link, menu, version banner, prev/next nav).
        text = re.sub(r'\s+', ' ', text).strip()
        text = re.sub(r'^Skip to main content\s*', '', text)
        text = re.sub(r'^Menu\s+', '', text)
        text = re.sub(r'Wren-CLI\s+v[\d.]+\s*', '', text)
        text = re.sub(r'Previous:.*?Next:.*?$', '', text)
        text = text.strip()

        # Keep only real prose sentences: long enough and not code lines.
        sentences = re.split(r'(?<=[.!?])\s+', text)
        sentences = [s for s in sentences if len(s) > 20 and not s.startswith(('import ', 'var ', '//'))]

        if not sentences:
            return f"{title} - Wren-CLI documentation and reference."

        description = sentences[0]

        if len(description) > max_length:
            # Truncate on a word boundary and mark the cut with an ellipsis.
            description = description[:max_length-3].rsplit(' ', 1)[0] + '...'
        elif len(description) < 80 and len(sentences) > 1:
            # Short first sentence: append more sentences while they fit.
            for s in sentences[1:]:
                if len(description) + len(s) + 1 <= max_length:
                    description += ' ' + s
                else:
                    break

        return description

    def extract_title(self, html):
        """Return the page title: first <h1>, else <title> (site suffix
        stripped), else a generic default."""
        match = re.search(r'<h1[^>]*>([^<]+)</h1>', html)
        if match:
            return match.group(1).strip()
        match = re.search(r'<title[^>]*>([^<]+)</title>', html)
        if match:
            title = match.group(1).strip()
            if ' - ' in title:
                # Drop the " - Site Name" suffix.
                return title.split(' - ')[0]
            return title
        return 'Wren-CLI Documentation'
|
|
|
|
|
|
class TextExtractor(HTMLParser):
    """HTML parser that collects visible page text, ignoring content
    inside non-article elements (scripts, styles, navigation chrome)."""

    def __init__(self):
        super().__init__()
        self.text = []
        self.skip_tags = {'script', 'style', 'nav', 'head', 'header', 'footer', 'aside'}
        # Nesting level of skipped elements; text is kept only at level 0.
        self.skip_depth = 0

    def handle_starttag(self, tag, attrs):
        if tag in self.skip_tags:
            self.skip_depth += 1

    def handle_endtag(self, tag):
        # Guard against unbalanced markup driving the depth negative.
        if self.skip_depth > 0 and tag in self.skip_tags:
            self.skip_depth -= 1

    def handle_data(self, data):
        if self.skip_depth:
            return
        chunk = data.strip()
        if chunk:
            self.text.append(chunk)

    def get_text(self):
        """Return all collected fragments joined with single spaces."""
        return ' '.join(self.text)
|
|
|
|
|
|
class LinkExtractor(HTMLParser):
    """HTML parser that records every referenced URL in document order:
    <a>/<link> hrefs and <img>/<script> srcs."""

    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        found = dict(attrs)
        # href-bearing tags and src-bearing tags share the same handling;
        # only the attribute name differs.
        if tag in ('a', 'link') and 'href' in found:
            self.links.append(found['href'])
        elif tag in ('img', 'script') and 'src' in found:
            self.links.append(found['src'])
|
|
|
|
|
|
def extract_text(html):
    """Return the visible text of *html* with markup stripped."""
    extractor = TextExtractor()
    extractor.feed(html)
    return extractor.get_text()
|
|
|
|
|
|
def escape_json_for_html(json_string):
    """Make a JSON document safe to embed inline in an HTML <script> block.

    The '<' of '</' (which could terminate the script element early) and
    of '<!--' (which would open an HTML comment) is replaced with the
    '\\u003c' escape.  Unlike the previous '<\\!--' output, '\\u003c' is a
    valid escape sequence in both JSON and JavaScript, so the result still
    round-trips through JSON.parse()/json.loads().
    """
    return (json_string
            .replace('</', '\\u003c/')
            .replace('<!--', '\\u003c!--'))
|
|
|
|
|
|
class TemplateFormatter:
    """Normalizes manual page templates in place before rendering.

    Three fixes are applied: a standard author comment header, consistent
    indentation inside ``{% block article %}``, and section-qualified
    prev/next navigation URLs.
    """

    # Indentation unit enforced inside the article block.
    INDENT = '    '

    def __init__(self):
        # Paths (as strings) of every file actually modified, for report().
        self.fixes = []

    def format_file(self, path, section=''):
        """Apply all fixes to the template at *path*.

        *section* is the page's directory relative to pages/ ('' for root
        pages); it is prepended to bare navigation URLs.  Writes the file
        back and returns True only if something changed.
        """
        content = path.read_text()
        original = content

        content = self._ensure_author_comment(content)
        content = self._fix_article_indentation(content)
        content = self._fix_navigation_urls(content, section)

        if content != original:
            path.write_text(content)
            self.fixes.append(str(path))
            return True
        return False

    def _ensure_author_comment(self, content):
        # Prepend the author header unless one is already present.
        if not content.startswith('{# retoor'):
            return '{# retoor <retoor@molodetz.nl> #}\n' + content
        return content

    def _fix_article_indentation(self, content):
        """Indent flush-left HTML tags inside the article block."""
        lines = content.split('\n')
        result = []
        in_article = False

        for line in lines:
            if '{% block article %}' in line:
                in_article = True
                result.append(line)
                continue
            if '{% endblock %}' in line and in_article:
                in_article = False
                result.append(line)
                continue

            # Only touch lines starting a tag at column 0; '<!' (comments,
            # doctype) is deliberately left alone.
            if in_article and line and not line.startswith('    '):
                stripped = line.strip()
                if stripped.startswith('<') and not stripped.startswith('<!'):
                    line = self.INDENT + stripped
            result.append(line)

        return '\n'.join(result)

    def _fix_navigation_urls(self, content, section):
        """Prefix bare prev/next page URLs with *section* (e.g. 'api/')."""
        import re
        # Root pages need no qualification.
        if not section or section == '.':
            return content

        def fix_url(match):
            prefix = match.group(1)
            url = match.group(2)
            if '/' in url:
                # Already qualified with a directory; leave untouched.
                return match.group(0)
            return f'{prefix}{section}/{url}'

        def fix_prev_line(match):
            line = match.group(0)
            # The Home link is intentionally root-relative; never rewrite it.
            if '"url": "index.html"' in line and '"title": "Home"' in line:
                return line
            # Qualify any bare .html URL inside the prev_page literal.
            return re.sub(r'("url":\s*")([a-z0-9_-]+\.html")', lambda m: m.group(1) + section + '/' + m.group(2) if '/' not in m.group(2) else m.group(0), line)

        content = re.sub(
            r'{% set prev_page = \{[^}]+\} %}',
            fix_prev_line,
            content
        )
        content = re.sub(
            r'(next_page\s*=\s*\{"url":\s*")([^"]+\.html")',
            fix_url,
            content
        )
        return content

    def report(self):
        """Print a summary of every file modified by format_file()."""
        if self.fixes:
            print(f" Auto-formatted {len(self.fixes)} file(s):")
            for f in self.fixes:
                print(f" {f}")
|
|
|
|
|
|
class ManualBuilder:
    """Builds the static Wren-CLI manual site from Jinja2 templates.

    Pages are rendered in two passes: the first pass extracts page text
    for SEO metadata and the search index, the second re-renders every
    page with that data embedded.
    """

    def __init__(self):
        # Repository root: this script lives one directory below it.
        self.root = Path(__file__).parent.parent
        self.src = self.root / 'manual_src'
        self.output = self.root / 'bin' / 'manual'
        self.site = self.load_yaml('data/site.yaml')
        self.nav = self.load_yaml('data/navigation.yaml')
        self.seo = SEOGenerator()

        # Template lookups try templates/ first, then the source root.
        templates_loader = FileSystemLoader(str(self.src / 'templates'))
        pages_loader = FileSystemLoader(str(self.src))

        self.env = Environment(
            loader=ChoiceLoader([templates_loader, pages_loader]),
            trim_blocks=True,
            lstrip_blocks=True
        )

        # Site-wide data available to every template.
        self.env.globals.update({
            'site': self.site,
            'nav': self.nav
        })

    def load_yaml(self, path):
        """Load a YAML file, *path* relative to the manual source dir."""
        with open(self.src / path) as f:
            return yaml.safe_load(f)

    def format_templates(self):
        """Run TemplateFormatter over every page template in place."""
        formatter = TemplateFormatter()
        pages_dir = self.src / 'pages'
        for html_file in pages_dir.rglob('*.html'):
            rel_path = html_file.relative_to(pages_dir)
            # Section is the page's subdirectory ('' for root pages).
            section = str(rel_path.parent) if rel_path.parent != Path('.') else ''
            formatter.format_file(html_file, section)
        formatter.report()

    def build(self):
        """Full build: format, render twice, copy assets, validate links."""
        print("Formatting templates...")
        self.format_templates()

        # Always start from a clean output directory.
        if self.output.exists():
            shutil.rmtree(self.output)
        self.output.mkdir(parents=True)

        # Pass 1 renders pages so their text can be indexed; pass 2
        # re-renders with the search index available to templates.
        self.build_pages()
        search_index = self.build_search_index()
        self.env.globals['search_index_json'] = escape_json_for_html(json.dumps(search_index))
        self.rebuild_pages_with_index()
        self.copy_static()
        self.validate_links()

        print(f"Built manual to {self.output}")

    def build_pages(self):
        """Render every page template into the output directory."""
        pages_dir = self.src / 'pages'
        for html_file in pages_dir.rglob('*.html'):
            rel_path = html_file.relative_to(pages_dir)
            self.build_page(html_file, rel_path)

    def rebuild_pages_with_index(self):
        """Re-render all pages; search_index_json is now in env globals."""
        pages_dir = self.src / 'pages'
        for html_file in pages_dir.rglob('*.html'):
            rel_path = html_file.relative_to(pages_dir)
            self.build_page(html_file, rel_path)

    def build_page(self, src_path, rel_path):
        """Render a single page, deriving its SEO metadata on the fly."""
        template_path = f'pages/{rel_path}'
        template = self.env.get_template(template_path)

        # Relative path prefix back to the site root for static assets.
        depth = len(rel_path.parts) - 1
        static_prefix = '../' * depth if depth > 0 else './'

        # First render with empty SEO data, just to obtain the page text.
        html = template.render(
            current_path=str(rel_path),
            static_prefix=static_prefix,
            depth=depth,
            seo={}
        )

        text = extract_text(html)
        title = self.seo.extract_title(html)

        seo = {
            'keywords': self.seo.extract_keywords(text, title),
            'description': self.seo.generate_description(text, title),
            'og_title': f"{title} - Wren-CLI",
            # API and tutorial pages are marked as articles for OpenGraph.
            'og_type': 'article' if 'api/' in str(rel_path) or 'tutorials/' in str(rel_path) else 'website',
        }

        # Final render with the real SEO metadata.
        html = template.render(
            current_path=str(rel_path),
            static_prefix=static_prefix,
            depth=depth,
            seo=seo
        )

        out_path = self.output / rel_path
        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_text(html)
        print(f" {rel_path}")

    def copy_static(self):
        """Copy static assets and, if present, the WASM build output."""
        static_src = self.src / 'static'
        for item in static_src.rglob('*'):
            if item.is_file():
                rel = item.relative_to(static_src)
                dest = self.output / rel
                dest.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy2(item, dest)

        # The WASM playground build is optional; copy it when it exists.
        wasm_src = self.root / 'bin' / 'wasm'
        wasm_dest = self.output / 'wasm'
        if wasm_src.exists():
            wasm_dest.mkdir(parents=True, exist_ok=True)
            for item in wasm_src.iterdir():
                if item.is_file():
                    shutil.copy2(item, wasm_dest / item.name)
            print(f" Copied WASM files to {wasm_dest}")

    def build_search_index(self):
        """Write search-index.json from navigation data plus rendered text.

        Returns the index dict so the caller can also embed it in pages.
        """
        index = {'pages': []}

        for section in self.nav['sections']:
            section_title = section['title']
            section_dir = section['directory']

            for page in section.get('pages', []):
                # Root-level pages have directory "." and no URL prefix.
                if section_dir == ".":
                    url = f"{page['file']}.html"
                else:
                    url = f"{section_dir}/{page['file']}.html"
                rendered_path = self.output / url

                # Index at most the first 500 words of the rendered body.
                content = ''
                if rendered_path.exists():
                    html = rendered_path.read_text()
                    content = extract_text(html)
                    content = ' '.join(content.split()[:500])

                index['pages'].append({
                    'url': url,
                    'title': page['title'],
                    'section': section_title,
                    'description': page.get('description', ''),
                    'methods': page.get('methods', []),
                    'content': content
                })

        (self.output / 'search-index.json').write_text(
            json.dumps(index, indent=2)
        )

        return index

    def _is_local_link(self, link):
        # Only filesystem-relative links can be validated locally;
        # external schemes and pure fragments are skipped.
        if not link:
            return False
        if link.startswith(('http://', 'https://', 'mailto:', 'javascript:', '#', 'data:')):
            return False
        return True

    def _resolve_link(self, base_dir, link):
        # Drop fragment and query; an empty remainder means the link
        # pointed at the current directory's index page.
        link = link.split('#')[0].split('?')[0]
        if not link:
            return base_dir / 'index.html'
        return (base_dir / link).resolve()

    def validate_links(self):
        """Fail the build (SystemExit 1) if any local link has no target."""
        broken = []
        for html_file in self.output.rglob('*.html'):
            rel_path = html_file.relative_to(self.output)
            html = html_file.read_text()

            extractor = LinkExtractor()
            extractor.feed(html)

            for link in extractor.links:
                if self._is_local_link(link):
                    target = self._resolve_link(html_file.parent, link)
                    if not target.exists():
                        broken.append((rel_path, link))

        if broken:
            print("Broken links found:")
            for page, link in broken:
                print(f" {page}: {link}")
            raise SystemExit(1)
|
|
|
|
|
|
def main():
    """Entry point: build the complete manual site."""
    ManualBuilder().build()
|
|
|
|
|
|
# Run the build only when executed as a script, not on import.
if __name__ == '__main__':
    main()
|