Merge pull request 'Improved site embedding' (#57) from BordedDev/snek:feat/improved-embeding into main

Reviewed-on: retoor/snek#57
This commit is contained in:
retoor 2025-06-29 16:24:33 +02:00
commit 006882cd6f

View File

@ -193,7 +193,7 @@ def embed_youtube(text):
"www.youtube-nocookie.com", "www.youtube-nocookie.com",
"youtube-nocookie.com", "youtube-nocookie.com",
] ]
and any(url.path.startswith(p) for p in ["/watch", "/embed"]) and any(url.path.startswith(p) for p in ["/watch", "/embed", "/shorts"])
): ):
queries = parse_qs(url.query) queries = parse_qs(url.query)
if "v" in queries: if "v" in queries:
@ -333,29 +333,11 @@ def get_url_content(url):
return None return None
def embed_url(text): def get_element_options(head_info, elem=None, meta=None, ograph=None, twitter=None):
soup = BeautifulSoup(text, "html.parser")
attachments = {}
for element in soup.find_all("a"):
if "href" in element.attrs and element.attrs["href"].startswith("http") and ("data-noembed" not in element.attrs):
page_url = urlparse(element.attrs["href"])
page = get_url_content(element.attrs["href"])
if page:
parsed_page = BeautifulSoup(page, "html.parser")
head_info = parsed_page.find("head")
if head_info:
def get_element_options(
elem=None, meta=None, ograph=None, twitter=None
):
if twitter: if twitter:
tw_tag = head_info.find( tw_tag = head_info.find(
"meta", attrs={"name": "twitter:" + twitter} "meta", attrs={"name": "twitter:" + twitter}
) or head_info.find( ) or head_info.find("meta", attrs={"property": "twitter:" + twitter})
"meta", attrs={"property": "twitter:" + twitter}
)
if tw_tag: if tw_tag:
return tw_tag.get("content", tw_tag.get("value", None)) return tw_tag.get("content", tw_tag.get("value", None))
@ -367,13 +349,11 @@ def embed_url(text):
return og_tag.get("content", og_tag.get("value", None)) return og_tag.get("content", og_tag.get("value", None))
if meta: if meta:
meta_tag = head_info.find( meta_tag = head_info.find("meta", attrs={"name": meta}) or head_info.find(
"meta", attrs={"name": meta} "meta", attrs={"property": meta}
) or head_info.find("meta", attrs={"property": meta})
if meta_tag:
return meta_tag.get(
"content", meta_tag.get("value", None)
) )
if meta_tag:
return meta_tag.get("content", meta_tag.get("value", None))
if elem: if elem:
elem_tag = head_info.find(elem) elem_tag = head_info.find(elem)
@ -382,33 +362,103 @@ def embed_url(text):
return None return None
def embed_url(text):
soup = BeautifulSoup(text, "html.parser")
attachments = {}
for element in soup.find_all("a"):
if (
"href" in element.attrs
and element.attrs["href"].startswith("http")
and element.attrs["href"] not in attachments
and ("data-noembed" not in element.attrs)
):
original_link_name = element.attrs["href"] original_link_name = element.attrs["href"]
if original_link_name in attachments: page_url = urlparse(element.attrs["href"])
page = get_url_content(element.attrs["href"])
if not page:
continue
parsed_page = BeautifulSoup(page, "html.parser")
head_info = parsed_page.find("head")
if not head_info:
continue continue
page_name = ( page_name = (
get_element_options("title", "title", "title", "title") get_element_options(head_info, "title", "title", "title", "title")
or page_url.netloc or page_url.netloc
) )
page_site = ( page_site = (
get_element_options(None, "site", "site", "site") get_element_options(head_info, None, "site", "site", "site")
or get_element_options(head_info, ograph="site_name")
or page_url.netloc or page_url.netloc
) )
page_description = get_element_options( page_description = get_element_options(
None, "description", "description", "description" head_info, None, "description", "description", "description"
) )
page_image = get_element_options(None, "image", "image", "image")
page_image_alt = get_element_options(
None, "image:alt", "image:alt", "image:alt"
)
page_video = get_element_options(None, "video", "video", "video")
page_audio = get_element_options(None, "audio", "audio", "audio")
( page_image = (
get_element_options(None, None, None, "card") get_element_options(head_info, None, "image", "image", "image")
or "summary_large_image" or get_element_options(
head_info, None, "image:url", "image:url", "image:url"
) )
or get_element_options(
head_info,
None,
"image:secure_url",
"image:secure_url",
"image:secure_url",
)
)
page_image_height = get_element_options(
head_info, None, "image:height", "image:height", "image:height"
)
page_image_width = get_element_options(
head_info, None, "image:width", "image:width", "image:width"
)
page_image_alt = get_element_options(
head_info, None, "image:alt", "image:alt", "image:alt"
)
page_video = (
get_element_options(head_info, None, "video", "video", "video")
or get_element_options(
head_info, None, "video:url", "video:url", "video:url"
)
or get_element_options(
head_info,
None,
"video:secure_url",
"video:secure_url",
"video:secure_url",
)
)
page_video_height = get_element_options(
head_info, None, "video:height", "video:height", "video:height"
)
page_video_width = get_element_options(
head_info, None, "video:width", "video:width", "video:width"
)
page_audio = (
get_element_options(head_info, None, "audio", "audio", "audio")
or get_element_options(
head_info, None, "audio:url", "audio:url", "audio:url"
)
or get_element_options(
head_info,
None,
"audio:secure_url",
"audio:secure_url",
"audio:secure_url",
)
)
(get_element_options(head_info, twitter="card") or "summary_large_image")
attachment_base = BeautifulSoup(str(element), "html.parser") attachment_base = BeautifulSoup(str(element), "html.parser")
attachments[original_link_name] = attachment_base attachments[original_link_name] = attachment_base
@ -421,54 +471,47 @@ def embed_url(text):
render_element = attachment render_element = attachment
if page_image: if page_image:
image_template = f'<span><img src="{page_image}" alt="{page_image_alt or page_name}" title="{page_name}" width="420" height="240" /></span>' style = {
render_element.append( "width": page_image_width + "px" if page_image_width else None,
BeautifulSoup(image_template, "html.parser") "height": page_image_height + "px" if page_image_height else None,
}
style_string = "; ".join(
f"{key}: {value}" for key, value in style.items() if value
) )
image_template = f'<span><img src="{page_image}" alt="{page_image_alt or page_name}" title="{page_name}" width="1" height="1" style="{style_string}" /></span>'
render_element.append(BeautifulSoup(image_template, "html.parser"))
if page_video: if page_video:
video_template = f'<video controls><source src="{page_video}">Your browser does not support the video tag.</video>' style = {
render_element.append( "width": page_video_width + "px" if page_video_width else None,
BeautifulSoup(video_template, "html.parser") "height": page_video_height + "px" if page_video_height else None,
}
style_string = "; ".join(
f"{key}: {value}" for key, value in style.items() if value
) )
video_template = f'<video controls style="{style_string}"><source src="{page_video}">Your browser does not support the video tag.</video>'
render_element.append(BeautifulSoup(video_template, "html.parser"))
if page_audio: if page_audio:
audio_template = f'<audio controls><source src="{page_audio}">Your browser does not support the audio tag.</audio>' audio_template = f'<audio controls><source src="{page_audio}">Your browser does not support the audio tag.</audio>'
render_element.append( render_element.append(BeautifulSoup(audio_template, "html.parser"))
BeautifulSoup(audio_template, "html.parser")
)
description_element_base = BeautifulSoup( description_element = BeautifulSoup(
"<span class='description'></span>", "html.parser" f"""
) <span class='description'>
description_element = next(description_element_base.children) <p class="page-site">{page_site}</p>
description_element.append( <strong class="page-name">{page_name}</strong>
BeautifulSoup( <p class='page-description'>{page_description or "No description available."}</p>
f'<p class="page-site">{page_site}</p>', <p class='page-original-link'>{original_link_name}</p>
</span>
""",
"html.parser", "html.parser",
) )
)
description_element.append( render_element.append(description_element)
BeautifulSoup(
f'<strong class="page-name">{page_name}</strong>',
"html.parser",
)
)
description_element.append(
BeautifulSoup(
f"<p class='page-description'>{page_description or 'No description available.'}</p>",
"html.parser",
)
)
description_element.append(
BeautifulSoup(
f"<p class='page-original-link'>{original_link_name}</p>",
"html.parser",
)
)
render_element.append(description_element_base)
for attachment in attachments.values(): for attachment in attachments.values():
soup.append(attachment) soup.append(attachment)