"""
BBCode markup language to HTML.
This module provides functionality to parse BBCode into HTML, sanitize the HTML,
and validate BBCode syntax, based on the implementation from osu-web.
Reference:
- https://osu.ppy.sh/wiki/BBCode
- https://github.com/ppy/osu-web/blob/master/app/Libraries/BBCodeFromDB.php
"""
import html
from typing import ClassVar
from app.models.userpage import (
ContentEmptyError,
ContentTooLongError,
ForbiddenTagError,
MaliciousBBCodeError,
)
import bleach
from bleach.css_sanitizer import CSSSanitizer
import regex as re
# Matches a leading "http://" or "https://" scheme, case-insensitively.
HTTP_PATTERN = re.compile(r"^https?://", re.IGNORECASE)
# Passed as ``timeout=`` to the ``re.sub`` calls in this module. ``re`` is the
# third-party ``regex`` package here, whose timeout is expressed in seconds.
REGEX_TIMEOUT = 5
class BBCodeService:
    """A service for parsing and sanitizing BBCode content.

    All methods are classmethods; the class only carries the tag/attribute
    lists below as shared configuration.

    Attributes:
        ALLOWED_TAGS: A list of allowed HTML tags in sanitized content.
        ALLOWED_ATTRIBUTES: A dictionary mapping HTML tags to their allowed attributes.
        FORBIDDEN_TAGS: A list of disallowed HTML tags that should not appear in user-generated content.

    Methods:
        parse_bbcode(text: str) -> str:
            Parse BBCode text and convert it to HTML.
        make_tag(tag: str, content: str, attributes: dict[str, str] | None = None, self_closing: bool = False) -> str:
            Generate an HTML tag with optional attributes.
        sanitize_html(html_content: str) -> str:
            Clean and sanitize HTML content to prevent XSS attacks.
        process_userpage_content(raw_content: str, max_length: int = 60000) -> dict[str, str]:
            Process user page content based on osu-web's handling procedure.
    """

    # allowed HTML tags in sanitized content
    ALLOWED_TAGS: ClassVar[list[str]] = [
        "a",
        "audio",
        "blockquote",
        "br",
        "button",
        "center",
        "code",
        "del",
        "div",
        "em",
        "h2",
        "h4",
        "iframe",
        "img",
        "li",
        "ol",
        "p",
        "pre",
        "span",
        "strong",
        "u",
        "ul",
        # imagemap
        "map",
        "area",
        # custom box
        "details",
        "summary",
    ]

    # Per-tag attribute allowlist. The "*" entry presumably applies to every
    # tag (bleach convention) — confirm against the sanitizer call, which is
    # outside this view.
    ALLOWED_ATTRIBUTES: ClassVar[dict[str, list[str]]] = {
        "a": ["href", "rel", "class", "data-user-id", "target", "style", "title"],
        "audio": ["controls", "preload", "src"],
        "blockquote": [],
        "button": ["type", "class", "style"],
        "center": [],
        "code": [],
        "div": ["class", "style"],
        "details": ["class"],
        "h2": [],
        "h4": [],
        "iframe": ["class", "src", "allowfullscreen", "width", "height", "frameborder"],
        "img": ["class", "loading", "src", "width", "height", "usemap", "alt", "style"],
        "map": ["name"],
        "area": ["href", "style", "title", "class"],
        "ol": ["class"],
        "span": ["class", "style", "title"],
        "summary": [],
        "ul": ["class"],
        "*": ["class"],
    }

    # Disallowed tags that should not appear in user-generated content
    FORBIDDEN_TAGS: ClassVar[list[str]] = [
        "script",
        # NOTE(review): "iframe" is also listed in ALLOWED_TAGS — confirm the
        # overlap is intended (e.g. generated embeds vs. user-typed markup).
        "iframe",
        "object",
        "embed",
        "form",
        "input",
        "textarea",
        "select",
        "option",
        "meta",
        "link",
        "style",
        "title",
        "head",
        "html",
        "body",
    ]
@classmethod
def parse_bbcode(cls, text: str) -> str:
    """
    Parse BBCode text and convert it to HTML.

    The input is HTML-escaped before any tag is processed, so every
    ``_parse_*`` step below operates on escaped text. The steps run in a
    fixed order, and remaining newlines are turned into ``<br />`` last.

    Args:
        text: Original text containing BBCode

    Returns:
        Converted HTML string, or an empty string when ``text`` is empty.

    Raises:
        MaliciousBBCodeError: If any parsing step raises ``TimeoutError``.

    Reference:
        - https://github.com/ppy/osu-web/blob/15e2d50067c8f5d3dfd2010a79a031efe0dfd10f/app/Libraries/BBCodeFromDB.php#L354
    """
    if not text:
        return ""

    text = html.escape(text)

    try:
        text = cls._parse_imagemap(text)
        text = cls._parse_box(text)
        text = cls._parse_code(text)
        text = cls._parse_list(text)
        text = cls._parse_notice(text)
        text = cls._parse_quote(text)
        text = cls._parse_heading(text)

        # inline tags
        text = cls._parse_audio(text)
        text = cls._parse_bold(text)
        text = cls._parse_centre(text)
        text = cls._parse_inline_code(text)
        text = cls._parse_colour(text)
        text = cls._parse_email(text)
        text = cls._parse_image(text)
        text = cls._parse_italic(text)
        text = cls._parse_size(text)
        text = cls._parse_smilies(text)
        text = cls._parse_spoiler(text)
        text = cls._parse_strike(text)
        text = cls._parse_underline(text)
        text = cls._parse_url(text)
        text = cls._parse_youtube(text)
        text = cls._parse_profile(text)
    except TimeoutError as exc:
        # Keep the original timeout as the cause so it shows up in tracebacks.
        raise MaliciousBBCodeError("Regular expression processing timed out.") from exc

    # replace newlines with <br />
    return text.replace("\n", "<br />")
@classmethod
def make_tag(
cls,
tag: str,
content: str,
attributes: dict[str, str] | None = None,
self_closing: bool = False,
) -> str:
"""Generate an HTML tag with optional attributes."""
attr_str = ""
if attributes:
attr_parts = [f'{key}="{html.escape(value)}"' for key, value in attributes.items()]
attr_str = " " + " ".join(attr_parts)
if self_closing:
return f"<{tag}{attr_str} />"
else:
return f"<{tag}{attr_str}>{content}{tag}>"
@classmethod
def _parse_audio(cls, text: str) -> str:
    """
    Convert ``[audio]url[/audio]`` into an ``<audio>`` element.

    The URL is everything between the tags (it may not contain ``[``), with
    surrounding whitespace stripped. The element is built via ``make_tag``
    with ``controls``, ``preload="none"`` and ``src`` attributes.

    Reference:
        - https://osu.ppy.sh/wiki/en/BBCode#audio
        - https://github.com/ppy/osu-web/blob/15e2d50067c8f5d3dfd2010a79a031efe0dfd10f/app/Libraries/BBCodeFromDB.php#L41
    """

    def to_audio_element(found):
        source = found.group(1).strip()
        audio_attrs = {"controls": "", "preload": "none", "src": source}
        return cls.make_tag("audio", "", attributes=audio_attrs)

    audio_regex = r"\[audio\]([^\[]+)\[/audio\]"
    return re.sub(audio_regex, to_audio_element, text, flags=re.IGNORECASE, timeout=REGEX_TIMEOUT)
@classmethod
def _parse_bold(cls, text: str) -> str:
    """
    Parse [b] tag.

    ``[b]`` becomes ``<strong>`` and ``[/b]`` becomes ``</strong>``. The two
    are replaced independently, so an unmatched tag is still converted.

    Reference:
        - https://osu.ppy.sh/wiki/en/BBCode#bold
        - https://github.com/ppy/osu-web/blob/15e2d50067c8f5d3dfd2010a79a031efe0dfd10f/app/Libraries/BBCodeFromDB.php#L55
    """
    text = re.sub(r"\[b\]", "<strong>", text, flags=re.IGNORECASE, timeout=REGEX_TIMEOUT)
    text = re.sub(r"\[/b\]", "</strong>", text, flags=re.IGNORECASE, timeout=REGEX_TIMEOUT)
    return text
@classmethod
def _parse_box(cls, text: str) -> str:
    """
    Convert ``[box=title]`` and ``[spoilerbox]`` blocks into spoiler-box markup.

    Both forms render the same structure through ``make_tag``: an outer
    ``div`` holding a toggle ``button`` (icon ``span`` plus heading) and a
    body ``div`` with the enclosed content. ``[box=title]`` uses the given
    title as the heading; ``[spoilerbox]`` uses the fixed text ``SPOILER``.
    Titled boxes are replaced first, then spoiler boxes.

    Reference:
        - https://osu.ppy.sh/wiki/en/BBCode#box
        - https://osu.ppy.sh/wiki/en/BBCode#spoilerbox
        - https://github.com/ppy/osu-web/blob/15e2d50067c8f5d3dfd2010a79a031efe0dfd10f/app/Libraries/BBCodeFromDB.php#L63
    """
    box_flags = re.DOTALL | re.IGNORECASE
    toggle_style = "background: none; border: none; cursor: pointer; padding: 0; text-align: left; width: 100%;"

    def render_box(heading, inner):
        # Shared markup for both box flavours; only the heading differs.
        link_icon = cls.make_tag("span", "", attributes={"class": "bbcode-spoilerbox__link-icon"})
        toggle = cls.make_tag(
            "button",
            link_icon + heading,
            attributes={
                "type": "button",
                "class": "js-spoilerbox__link bbcode-spoilerbox__link",
                "style": toggle_style,
            },
        )
        body = cls.make_tag("div", inner, attributes={"class": "js-spoilerbox__body bbcode-spoilerbox__body"})
        return cls.make_tag("div", toggle + body, attributes={"class": "js-spoilerbox bbcode-spoilerbox"})

    # [box=title] format
    with_titled_boxes = re.sub(
        r"\[box=([^\]]+)\](.*?)\[/box\]",
        lambda found: render_box(found.group(1), found.group(2)),
        text,
        flags=box_flags,
        timeout=REGEX_TIMEOUT,
    )

    # [spoilerbox] format
    return re.sub(
        r"\[spoilerbox\](.*?)\[/spoilerbox\]",
        lambda found: render_box("SPOILER", found.group(1)),
        with_titled_boxes,
        flags=box_flags,
        timeout=REGEX_TIMEOUT,
    )
@classmethod
def _parse_centre(cls, text: str) -> str:
    """
    Parse [centre] tag.

    ``[centre]`` becomes ``<center>`` and ``[/centre]`` becomes ``</center>``.
    The two are replaced independently, so an unmatched tag is still
    converted.

    Reference:
        - https://osu.ppy.sh/wiki/en/BBCode#centre
        - https://github.com/ppy/osu-web/blob/15e2d50067c8f5d3dfd2010a79a031efe0dfd10f/app/Libraries/BBCodeFromDB.php#L86
    """
    text = re.sub(r"\[centre\]", "<center>", text, flags=re.IGNORECASE, timeout=REGEX_TIMEOUT)
    # Match the full closing tag: a bare "[/c]" is the inline-code closer and
    # must be left for _parse_inline_code, which runs after this step.
    text = re.sub(r"\[/centre\]", "</center>", text, flags=re.IGNORECASE, timeout=REGEX_TIMEOUT)
    return text
@classmethod
def _parse_list(cls, text: str) -> str:
    """
    Parse [list] tag.

    ``[list=1]...[/list]`` becomes ``<ol>``, plain ``[list]...[/list]``
    becomes ``<ol class="unordered">``, and every ``[*]`` entry becomes an
    ``<li>``. Lists are converted first, items afterwards.

    Reference:
        - https://osu.ppy.sh/wiki/en/BBCode#formatted-lists
        - https://github.com/ppy/osu-web/blob/15e2d50067c8f5d3dfd2010a79a031efe0dfd10f/app/Libraries/BBCodeFromDB.php#L244
    """
    list_flags = re.DOTALL | re.IGNORECASE

    # ordered list
    def replace_ordered(match):
        return cls.make_tag("ol", match.group(1))

    text = re.sub(r"\[list=1\](.*?)\[/list\]", replace_ordered, text, flags=list_flags, timeout=REGEX_TIMEOUT)

    # unordered list
    def replace_unordered(match):
        return cls.make_tag("ol", match.group(1), attributes={"class": "unordered"})

    text = re.sub(r"\[list\](.*?)\[/list\]", replace_unordered, text, flags=list_flags, timeout=REGEX_TIMEOUT)

    # list item
    # Matched [/list] tags have already been rewritten to </ol> above, so an
    # item must also stop at </ol>; otherwise the last item of a list would
    # run on to the end of the whole text.
    item_pattern = r"\[\*\]\s*(.*?)(?=\[\*\]|\[/list\]|</ol>|$)"

    def replace_item(match):
        return cls.make_tag("li", match.group(1))

    text = re.sub(item_pattern, replace_item, text, flags=list_flags, timeout=REGEX_TIMEOUT)
    return text
@classmethod
def _parse_notice(cls, text: str) -> str:
    """
    Convert ``[notice]...[/notice]`` into a ``<div class="well">`` wrapper.

    Newlines directly after the opening tag and directly before the closing
    tag are dropped; everything in between becomes the div's content.

    Reference:
        - https://osu.ppy.sh/wiki/en/BBCode#notice
        - https://github.com/ppy/osu-web/blob/15e2d50067c8f5d3dfd2010a79a031efe0dfd10f/app/Libraries/BBCodeFromDB.php#L264
    """

    def wrap_in_well(found):
        return cls.make_tag("div", found.group(1), attributes={"class": "well"})

    notice_regex = r"\[notice\]\n*(.*?)\n*\[/notice\]"
    notice_flags = re.DOTALL | re.IGNORECASE
    return re.sub(notice_regex, wrap_in_well, text, flags=notice_flags, timeout=REGEX_TIMEOUT)
@classmethod
def _parse_profile(cls, text: str) -> str:
    """
    Convert ``[profile]`` tags into profile links.

    ``[profile=ID]name[/profile]`` links to ``/users/ID`` and carries the id
    in a ``data-user-id`` attribute. Without an id, the link text and the
    target are both derived from the name, prefixed with ``@``.

    Reference:
        - https://osu.ppy.sh/wiki/en/BBCode#profile
        - https://github.com/ppy/osu-web/blob/15e2d50067c8f5d3dfd2010a79a031efe0dfd10f/app/Libraries/BBCodeFromDB.php#L273
    """

    def to_profile_link(found):
        user_id, username = found.group(1), found.group(2)
        if not user_id:
            # No numeric id supplied: fall back to a name-based link.
            name_attrs = {"href": f"/users/@{username}", "class": "user-profile-link"}
            return cls.make_tag("a", f"@{username}", attributes=name_attrs)
        id_attrs = {"href": f"/users/{user_id}", "class": "user-profile-link", "data-user-id": user_id}
        return cls.make_tag("a", username, attributes=id_attrs)

    profile_regex = r"\[profile(?:=(\d+))?\](.*?)\[/profile\]"
    return re.sub(profile_regex, to_profile_link, text, flags=re.IGNORECASE, timeout=REGEX_TIMEOUT)
@classmethod
def _parse_quote(cls, text: str) -> str:
    """
    Parse [quote] tag.

    ``[quote="author"]content[/quote]`` becomes a ``<blockquote>`` that
    starts with an ``<h4>`` reading ``author wrote:``. A plain
    ``[quote]content[/quote]`` becomes a bare ``<blockquote>``. Whitespace
    directly inside the tags is trimmed. Named quotes are converted first.

    Reference:
        - https://osu.ppy.sh/wiki/en/BBCode#quote
        - https://github.com/ppy/osu-web/blob/15e2d50067c8f5d3dfd2010a79a031efe0dfd10f/app/Libraries/BBCodeFromDB.php#L285
    """
    quote_flags = re.DOTALL | re.IGNORECASE

    # [quote="author"]content[/quote]
    # Handle both raw quotes and HTML-escaped quotes (&quot;): parse_bbcode
    # escapes the text first, so the author delimiters normally arrive as
    # &quot; rather than as a literal double quote.
    pattern1 = r'\[quote=(?:&quot;|")(.+?)(?:&quot;|")\]\s*(.*?)\s*\[/quote\]'

    def replace_quote1(match):
        author = match.group(1)
        content = match.group(2)
        heading = cls.make_tag("h4", f"{author} wrote:")
        return cls.make_tag("blockquote", heading + content)

    text = re.sub(pattern1, replace_quote1, text, flags=quote_flags, timeout=REGEX_TIMEOUT)

    # [quote]content[/quote]
    pattern2 = r"\[quote\]\s*(.*?)\s*\[/quote\]"

    def replace_quote2(match):
        return cls.make_tag("blockquote", match.group(1))

    text = re.sub(pattern2, replace_quote2, text, flags=quote_flags, timeout=REGEX_TIMEOUT)
    return text
@classmethod
def _parse_size(cls, text: str) -> str:
    """
    Parse [size] tag.

    ``[size=N]`` becomes an opening ``<span style="font-size:N%">`` with N
    clamped to the 30-200 range, and ``[/size]`` becomes ``</span>``, so the
    text between the tags is what gets resized. Opening and closing tags are
    replaced independently, so an unmatched tag is still converted.

    Reference:
        - https://osu.ppy.sh/wiki/en/BBCode#font-size
        - https://github.com/ppy/osu-web/blob/15e2d50067c8f5d3dfd2010a79a031efe0dfd10f/app/Libraries/BBCodeFromDB.php#L326
    """

    def replace_size(match):
        try:
            size = int(match.group(1))
        except ValueError:
            # int() can reject extremely long digit runs (interpreter digit
            # limit); treat such input as the maximum instead of failing.
            size = 200
        # limit font size range (30-200%)
        size = max(30, min(200, size))
        # Emit only the opening tag; the matching [/size] closes the span.
        return f'<span style="font-size:{size}%">'

    pattern = r"\[size=(\d+)\]"
    text = re.sub(pattern, replace_size, text, flags=re.IGNORECASE, timeout=REGEX_TIMEOUT)
    text = re.sub(r"\[/size\]", "</span>", text, flags=re.IGNORECASE, timeout=REGEX_TIMEOUT)
    return text
@classmethod
def _parse_smilies(cls, text: str) -> str:
"""
Parse smilies.
Reference:
- https://osu.ppy.sh/wiki/en/BBCode
- https://github.com/ppy/osu-web/blob/15e2d50067c8f5d3dfd2010a79a031efe0dfd10f/app/Libraries/BBCodeFromDB.php#L296
"""
# handle phpBB style smilies
pattern = r"