|
#!/usr/bin/env python3
|
|
# retoor <retoor@molodetz.nl>
|
|
|
|
import json
import re
import shutil
from collections import Counter
from html.parser import HTMLParser
from pathlib import Path

import yaml
from jinja2 import Environment, FileSystemLoader, ChoiceLoader
|
|
|
|
|
|
class SEOGenerator:
    """Derives SEO metadata (keywords, meta description, title) from the
    rendered text of a Wren-CLI manual page."""

    # Common English words excluded from keyword ranking.
    STOP_WORDS = {
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
        'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'been',
        'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
        'could', 'should', 'may', 'might', 'must', 'shall', 'can', 'need',
        'this', 'that', 'these', 'those', 'it', 'its', 'they', 'them',
        'we', 'us', 'you', 'your', 'he', 'she', 'him', 'her', 'i', 'my',
        'if', 'then', 'else', 'when', 'where', 'why', 'how', 'what', 'which',
        'who', 'whom', 'not', 'no', 'yes', 'all', 'any', 'both', 'each',
        'more', 'most', 'other', 'some', 'such', 'only', 'same', 'so',
        'than', 'too', 'very', 'just', 'also', 'now', 'here', 'there'
    }

    # Domain-specific terms that get a relevance boost in keyword ranking.
    WREN_TERMS = {
        'wren', 'fiber', 'class', 'method', 'module', 'import', 'foreign',
        'static', 'construct', 'scheduler', 'async', 'await', 'cli', 'api',
        'json', 'http', 'websocket', 'sqlite', 'crypto', 'tls', 'regex'
    }

    def extract_keywords(self, text, title, max_keywords=10):
        """Return up to *max_keywords* keywords from *text*, ranked by
        weighted frequency.

        Words that also appear in *title* are weighted 3x; Wren-specific
        terms are weighted 2x; stop words are ignored.  Ties keep
        first-seen order.
        """
        # Identifier-like words of 3+ chars; the regex already enforces the
        # minimum length, so no separate len() check is needed.
        words = re.findall(r'\b[a-zA-Z][a-zA-Z0-9_]{2,}\b', text.lower())
        freq = Counter(w for w in words if w not in self.STOP_WORDS)

        title_words = set(re.findall(r'\b[a-zA-Z][a-zA-Z0-9_]+\b', title.lower()))
        for word in freq:
            if word in title_words:
                freq[word] *= 3
            if word in self.WREN_TERMS:
                freq[word] *= 2

        # most_common() is stable for equal counts, matching the previous
        # sorted(..., reverse=True) behavior.
        return [word for word, _ in freq.most_common(max_keywords)]

    def generate_description(self, text, title, max_length=155):
        """Build a meta description of at most *max_length* chars from the
        first substantive sentence(s) of *text*.

        Falls back to a generic description when no usable sentence exists.
        """
        # Normalize whitespace, then strip site chrome that leaks into the
        # extracted text (skip link, menu, version banner, prev/next nav).
        text = re.sub(r'\s+', ' ', text).strip()
        text = re.sub(r'^Skip to main content\s*', '', text)
        text = re.sub(r'^Menu\s+', '', text)
        text = re.sub(r'Wren-CLI\s+v[\d.]+\s*', '', text)
        text = re.sub(r'Previous:.*?Next:.*?$', '', text)
        text = text.strip()

        # Keep only real prose sentences: long enough and not code lines.
        sentences = re.split(r'(?<=[.!?])\s+', text)
        sentences = [s for s in sentences if len(s) > 20 and not s.startswith(('import ', 'var ', '//'))]

        if not sentences:
            return f"{title} - Wren-CLI documentation and reference."

        description = sentences[0]

        if len(description) > max_length:
            # Truncate on a word boundary and mark the cut with an ellipsis.
            description = description[:max_length-3].rsplit(' ', 1)[0] + '...'
        elif len(description) < 80 and len(sentences) > 1:
            # Short first sentence: append more sentences while they fit.
            for s in sentences[1:]:
                if len(description) + len(s) + 1 <= max_length:
                    description += ' ' + s
                else:
                    break

        return description

    def extract_title(self, html):
        """Return the page title: first <h1>, else <title> (site suffix
        stripped), else a generic default."""
        match = re.search(r'<h1[^>]*>([^<]+)</h1>', html)
        if match:
            return match.group(1).strip()
        match = re.search(r'<title[^>]*>([^<]+)</title>', html)
        if match:
            title = match.group(1).strip()
            if ' - ' in title:
                # Drop the " - Site Name" suffix.
                return title.split(' - ')[0]
            return title
        return 'Wren-CLI Documentation'
|
|
|
|
|
|
class TextExtractor(HTMLParser):
    """HTML parser that collects visible page text, ignoring content
    inside non-article elements (scripts, styles, navigation chrome)."""

    def __init__(self):
        super().__init__()
        self.text = []
        self.skip_tags = {'script', 'style', 'nav', 'head', 'header', 'footer', 'aside'}
        # Nesting level of skipped elements; text is kept only at level 0.
        self.skip_depth = 0

    def handle_starttag(self, tag, attrs):
        if tag in self.skip_tags:
            self.skip_depth += 1

    def handle_endtag(self, tag):
        # Guard against unbalanced markup driving the depth negative.
        if self.skip_depth > 0 and tag in self.skip_tags:
            self.skip_depth -= 1

    def handle_data(self, data):
        if self.skip_depth:
            return
        chunk = data.strip()
        if chunk:
            self.text.append(chunk)

    def get_text(self):
        """Return all collected fragments joined with single spaces."""
        return ' '.join(self.text)
|
|
|
|
|
|
class LinkExtractor(HTMLParser):
    """HTML parser that records every referenced URL in document order:
    <a>/<link> hrefs and <img>/<script> srcs."""

    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        found = dict(attrs)
        # href-bearing tags and src-bearing tags share the same handling;
        # only the attribute name differs.
        if tag in ('a', 'link') and 'href' in found:
            self.links.append(found['href'])
        elif tag in ('img', 'script') and 'src' in found:
            self.links.append(found['src'])
|
|
|
|
|
|
def extract_text(html):
    """Return the visible text of *html* with markup stripped."""
    extractor = TextExtractor()
    extractor.feed(html)
    return extractor.get_text()
|
|
|
|
|
|
def escape_json_for_html(json_string):
    """Make a JSON document safe to embed inline in an HTML <script> block.

    The '<' of '</' (which could terminate the script element early) and
    of '<!--' (which would open an HTML comment) is replaced with the
    '\\u003c' escape.  Unlike the previous '<\\!--' output, '\\u003c' is a
    valid escape sequence in both JSON and JavaScript, so the result still
    round-trips through JSON.parse()/json.loads().
    """
    return (json_string
            .replace('</', '\\u003c/')
            .replace('<!--', '\\u003c!--'))
|
|
|
|
|
|
class TemplateFormatter:
    """Normalizes manual page templates in place before rendering.

    Three fixes are applied: a standard author comment header, consistent
    indentation inside ``{% block article %}``, and section-qualified
    prev/next navigation URLs.
    """

    # Indentation unit enforced inside the article block.
    INDENT = '    '

    def __init__(self):
        # Paths (as strings) of every file actually modified, for report().
        self.fixes = []

    def format_file(self, path, section=''):
        """Apply all fixes to the template at *path*.

        *section* is the page's directory relative to pages/ ('' for root
        pages); it is prepended to bare navigation URLs.  Writes the file
        back and returns True only if something changed.
        """
        content = path.read_text()
        original = content

        content = self._ensure_author_comment(content)
        content = self._fix_article_indentation(content)
        content = self._fix_navigation_urls(content, section)

        if content != original:
            path.write_text(content)
            self.fixes.append(str(path))
            return True
        return False

    def _ensure_author_comment(self, content):
        # Prepend the author header unless one is already present.
        if not content.startswith('{# retoor'):
            return '{# retoor <retoor@molodetz.nl> #}\n' + content
        return content

    def _fix_article_indentation(self, content):
        """Indent flush-left HTML tags inside the article block."""
        lines = content.split('\n')
        result = []
        in_article = False

        for line in lines:
            if '{% block article %}' in line:
                in_article = True
                result.append(line)
                continue
            if '{% endblock %}' in line and in_article:
                in_article = False
                result.append(line)
                continue

            # Only touch lines starting a tag at column 0; '<!' (comments,
            # doctype) is deliberately left alone.
            if in_article and line and not line.startswith('    '):
                stripped = line.strip()
                if stripped.startswith('<') and not stripped.startswith('<!'):
                    line = self.INDENT + stripped
            result.append(line)

        return '\n'.join(result)

    def _fix_navigation_urls(self, content, section):
        """Prefix bare prev/next page URLs with *section* (e.g. 'api/')."""
        import re
        # Root pages need no qualification.
        if not section or section == '.':
            return content

        def fix_url(match):
            prefix = match.group(1)
            url = match.group(2)
            if '/' in url:
                # Already qualified with a directory; leave untouched.
                return match.group(0)
            return f'{prefix}{section}/{url}'

        def fix_prev_line(match):
            line = match.group(0)
            # The Home link is intentionally root-relative; never rewrite it.
            if '"url": "index.html"' in line and '"title": "Home"' in line:
                return line
            # Qualify any bare .html URL inside the prev_page literal.
            return re.sub(r'("url":\s*")([a-z0-9_-]+\.html")', lambda m: m.group(1) + section + '/' + m.group(2) if '/' not in m.group(2) else m.group(0), line)

        content = re.sub(
            r'{% set prev_page = \{[^}]+\} %}',
            fix_prev_line,
            content
        )
        content = re.sub(
            r'(next_page\s*=\s*\{"url":\s*")([^"]+\.html")',
            fix_url,
            content
        )
        return content

    def report(self):
        """Print a summary of every file modified by format_file()."""
        if self.fixes:
            print(f" Auto-formatted {len(self.fixes)} file(s):")
            for f in self.fixes:
                print(f" {f}")
|
|
|
|
|
|
class ManualBuilder:
    """Builds the static Wren-CLI manual site from Jinja2 templates.

    Pages are rendered in two passes: the first pass extracts page text
    for SEO metadata and the search index, the second re-renders every
    page with that data embedded.
    """

    def __init__(self):
        # Repository root: this script lives one directory below it.
        self.root = Path(__file__).parent.parent
        self.src = self.root / 'manual_src'
        self.output = self.root / 'bin' / 'manual'
        self.site = self.load_yaml('data/site.yaml')
        self.nav = self.load_yaml('data/navigation.yaml')
        self.seo = SEOGenerator()

        # Template lookups try templates/ first, then the source root.
        templates_loader = FileSystemLoader(str(self.src / 'templates'))
        pages_loader = FileSystemLoader(str(self.src))

        self.env = Environment(
            loader=ChoiceLoader([templates_loader, pages_loader]),
            trim_blocks=True,
            lstrip_blocks=True
        )

        # Site-wide data available to every template.
        self.env.globals.update({
            'site': self.site,
            'nav': self.nav
        })

    def load_yaml(self, path):
        """Load a YAML file, *path* relative to the manual source dir."""
        with open(self.src / path) as f:
            return yaml.safe_load(f)

    def format_templates(self):
        """Run TemplateFormatter over every page template in place."""
        formatter = TemplateFormatter()
        pages_dir = self.src / 'pages'
        for html_file in pages_dir.rglob('*.html'):
            rel_path = html_file.relative_to(pages_dir)
            # Section is the page's subdirectory ('' for root pages).
            section = str(rel_path.parent) if rel_path.parent != Path('.') else ''
            formatter.format_file(html_file, section)
        formatter.report()

    def build(self):
        """Full build: format, render twice, copy assets, validate links."""
        print("Formatting templates...")
        self.format_templates()

        # Always start from a clean output directory.
        if self.output.exists():
            shutil.rmtree(self.output)
        self.output.mkdir(parents=True)

        # Pass 1 renders pages so their text can be indexed; pass 2
        # re-renders with the search index available to templates.
        self.build_pages()
        search_index = self.build_search_index()
        self.env.globals['search_index_json'] = escape_json_for_html(json.dumps(search_index))
        self.rebuild_pages_with_index()
        self.copy_static()
        self.validate_links()

        print(f"Built manual to {self.output}")

    def build_pages(self):
        """Render every page template into the output directory."""
        pages_dir = self.src / 'pages'
        for html_file in pages_dir.rglob('*.html'):
            rel_path = html_file.relative_to(pages_dir)
            self.build_page(html_file, rel_path)

    def rebuild_pages_with_index(self):
        """Re-render all pages; search_index_json is now in env globals."""
        pages_dir = self.src / 'pages'
        for html_file in pages_dir.rglob('*.html'):
            rel_path = html_file.relative_to(pages_dir)
            self.build_page(html_file, rel_path)

    def build_page(self, src_path, rel_path):
        """Render a single page, deriving its SEO metadata on the fly."""
        template_path = f'pages/{rel_path}'
        template = self.env.get_template(template_path)

        # Relative path prefix back to the site root for static assets.
        depth = len(rel_path.parts) - 1
        static_prefix = '../' * depth if depth > 0 else './'

        # First render with empty SEO data, just to obtain the page text.
        html = template.render(
            current_path=str(rel_path),
            static_prefix=static_prefix,
            depth=depth,
            seo={}
        )

        text = extract_text(html)
        title = self.seo.extract_title(html)

        seo = {
            'keywords': self.seo.extract_keywords(text, title),
            'description': self.seo.generate_description(text, title),
            'og_title': f"{title} - Wren-CLI",
            # API and tutorial pages are marked as articles for OpenGraph.
            'og_type': 'article' if 'api/' in str(rel_path) or 'tutorials/' in str(rel_path) else 'website',
        }

        # Final render with the real SEO metadata.
        html = template.render(
            current_path=str(rel_path),
            static_prefix=static_prefix,
            depth=depth,
            seo=seo
        )

        out_path = self.output / rel_path
        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_text(html)
        print(f" {rel_path}")

    def copy_static(self):
        """Copy static assets and, if present, the WASM build output."""
        static_src = self.src / 'static'
        for item in static_src.rglob('*'):
            if item.is_file():
                rel = item.relative_to(static_src)
                dest = self.output / rel
                dest.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy2(item, dest)

        # The WASM playground build is optional; copy it when it exists.
        wasm_src = self.root / 'bin' / 'wasm'
        wasm_dest = self.output / 'wasm'
        if wasm_src.exists():
            wasm_dest.mkdir(parents=True, exist_ok=True)
            for item in wasm_src.iterdir():
                if item.is_file():
                    shutil.copy2(item, wasm_dest / item.name)
            print(f" Copied WASM files to {wasm_dest}")

    def build_search_index(self):
        """Write search-index.json from navigation data plus rendered text.

        Returns the index dict so the caller can also embed it in pages.
        """
        index = {'pages': []}

        for section in self.nav['sections']:
            section_title = section['title']
            section_dir = section['directory']

            for page in section.get('pages', []):
                # Root-level pages have directory "." and no URL prefix.
                if section_dir == ".":
                    url = f"{page['file']}.html"
                else:
                    url = f"{section_dir}/{page['file']}.html"
                rendered_path = self.output / url

                # Index at most the first 500 words of the rendered body.
                content = ''
                if rendered_path.exists():
                    html = rendered_path.read_text()
                    content = extract_text(html)
                    content = ' '.join(content.split()[:500])

                index['pages'].append({
                    'url': url,
                    'title': page['title'],
                    'section': section_title,
                    'description': page.get('description', ''),
                    'methods': page.get('methods', []),
                    'content': content
                })

        (self.output / 'search-index.json').write_text(
            json.dumps(index, indent=2)
        )

        return index

    def _is_local_link(self, link):
        # Only filesystem-relative links can be validated locally;
        # external schemes and pure fragments are skipped.
        if not link:
            return False
        if link.startswith(('http://', 'https://', 'mailto:', 'javascript:', '#', 'data:')):
            return False
        return True

    def _resolve_link(self, base_dir, link):
        # Drop fragment and query; an empty remainder means the link
        # pointed at the current directory's index page.
        link = link.split('#')[0].split('?')[0]
        if not link:
            return base_dir / 'index.html'
        return (base_dir / link).resolve()

    def validate_links(self):
        """Fail the build (SystemExit 1) if any local link has no target."""
        broken = []
        for html_file in self.output.rglob('*.html'):
            rel_path = html_file.relative_to(self.output)
            html = html_file.read_text()

            extractor = LinkExtractor()
            extractor.feed(html)

            for link in extractor.links:
                if self._is_local_link(link):
                    target = self._resolve_link(html_file.parent, link)
                    if not target.exists():
                        broken.append((rel_path, link))

        if broken:
            print("Broken links found:")
            for page, link in broken:
                print(f" {page}: {link}")
            raise SystemExit(1)
|
|
|
|
|
|
def main():
    """Entry point: build the complete manual site."""
    ManualBuilder().build()
|
|
|
|
|
|
# Run the build only when executed as a script, not on import.
if __name__ == '__main__':
    main()
|