""" BBCode处理服务 基于 osu-web 官方实现的 BBCode 解析器 支持所有 osu! 官方 BBCode 标签 """ import html import re from typing import ClassVar from app.models.userpage import ( ContentEmptyError, ContentTooLongError, ForbiddenTagError, ) import bleach from bleach.css_sanitizer import CSSSanitizer class BBCodeService: """BBCode处理服务类 - 基于 osu-web 官方实现""" # 允许的HTML标签和属性 - 基于官方实现 ALLOWED_TAGS: ClassVar[list[str]] = [ "a", "audio", "blockquote", "br", "button", "center", "code", "del", "div", "em", "h2", "h4", "iframe", "img", "li", "ol", "p", "pre", "span", "strong", "u", "ul", # imagemap 相关 "map", "area", # 自定义容器 "details", "summary", ] ALLOWED_ATTRIBUTES: ClassVar[dict[str, list[str]]] = { "a": ["href", "rel", "class", "data-user-id", "target", "style", "title"], "audio": ["controls", "preload", "src"], "blockquote": [], "button": ["type", "class", "style"], "center": [], "code": [], "div": ["class", "style"], "details": ["class"], "h2": [], "h4": [], "iframe": ["class", "src", "allowfullscreen", "width", "height", "frameborder"], "img": ["class", "loading", "src", "width", "height", "usemap", "alt", "style"], "map": ["name"], "area": ["href", "style", "title", "class"], "ol": ["class"], "span": ["class", "style", "title"], "summary": [], "ul": ["class"], "*": ["class"], } # 危险的BBCode标签(不允许) FORBIDDEN_TAGS: ClassVar[list[str]] = [ "script", "iframe", "object", "embed", "form", "input", "textarea", "select", "option", "meta", "link", "style", "title", "head", "html", "body", ] @classmethod def parse_bbcode(cls, text: str) -> str: """ 解析BBCode文本并转换为HTML 基于 osu-web BBCodeFromDB.php 的实现 Args: text: 包含BBCode的原始文本 Returns: 转换后的HTML字符串 """ if not text: return "" # 预处理:转义HTML实体 text = html.escape(text) # 按照 osu-web 的解析顺序进行处理 # 块级标签处理 text = cls._parse_imagemap(text) text = cls._parse_box(text) text = cls._parse_code(text) text = cls._parse_list(text) text = cls._parse_notice(text) text = cls._parse_quote(text) text = cls._parse_heading(text) # 行内标签处理 text = cls._parse_audio(text) text = cls._parse_bold(text) text = cls._parse_centre(text) text = cls._parse_inline_code(text) text = cls._parse_colour(text) text = cls._parse_email(text) text = cls._parse_image(text) text = cls._parse_italic(text) text = cls._parse_size(text) text = cls._parse_smilies(text) text = cls._parse_spoiler(text) text = cls._parse_strike(text) text = cls._parse_underline(text) text = cls._parse_url(text) text = cls._parse_youtube(text) text = cls._parse_profile(text) # 换行处理 text = text.replace("\n", "
") return text @classmethod def _parse_audio(cls, text: str) -> str: """解析 [audio] 标签""" pattern = r"\[audio\]([^\[]+)\[/audio\]" def replace_audio(match): url = match.group(1).strip() return f'' return re.sub(pattern, replace_audio, text, flags=re.IGNORECASE) @classmethod def _parse_bold(cls, text: str) -> str: """解析 [b] 标签""" text = re.sub(r"\[b\]", "", text, flags=re.IGNORECASE) text = re.sub(r"\[/b\]", "", text, flags=re.IGNORECASE) return text @classmethod def _parse_box(cls, text: str) -> str: """解析 [box] 和 [spoilerbox] 标签""" # [box=title] 格式 pattern = r"\[box=([^\]]+)\](.*?)\[/box\]" def replace_box_with_title(match): title = match.group(1) content = match.group(2) return ( f"
" f"" f"
{content}
" ) text = re.sub(pattern, replace_box_with_title, text, flags=re.DOTALL | re.IGNORECASE) # [spoilerbox] 格式 pattern = r"\[spoilerbox\](.*?)\[/spoilerbox\]" def replace_spoilerbox(match): content = match.group(1) return ( f"
" f"" f"
{content}
" ) return re.sub(pattern, replace_spoilerbox, text, flags=re.DOTALL | re.IGNORECASE) @classmethod def _parse_centre(cls, text: str) -> str: """解析 [centre] 标签""" text = re.sub(r"\[centre\]", "
", text, flags=re.IGNORECASE) text = re.sub(r"\[/centre\]", "
", text, flags=re.IGNORECASE) text = re.sub(r"\[center\]", "
", text, flags=re.IGNORECASE) text = re.sub(r"\[/center\]", "
", text, flags=re.IGNORECASE) return text @classmethod def _parse_code(cls, text: str) -> str: """解析 [code] 标签""" pattern = r"\[code\]\n*(.*?)\n*\[/code\]" return re.sub(pattern, r"
\1
", text, flags=re.DOTALL | re.IGNORECASE) @classmethod def _parse_colour(cls, text: str) -> str: """解析 [color] 标签""" pattern = r"\[color=([^\]]+)\](.*?)\[/color\]" return re.sub(pattern, r'\2', text, flags=re.IGNORECASE) @classmethod def _parse_email(cls, text: str) -> str: """解析 [email] 标签""" # [email]email@example.com[/email] pattern1 = r"\[email\]([^\[]+)\[/email\]" text = re.sub(pattern1, r'\1', text, flags=re.IGNORECASE) # [email=email@example.com]text[/email] pattern2 = r"\[email=([^\]]+)\](.*?)\[/email\]" text = re.sub(pattern2, r'\2', text, flags=re.IGNORECASE) return text @classmethod def _parse_heading(cls, text: str) -> str: """解析 [heading] 标签""" pattern = r"\[heading\](.*?)\[/heading\]" return re.sub(pattern, r"

\1

", text, flags=re.IGNORECASE) @classmethod def _parse_image(cls, text: str) -> str: """解析 [img] 标签""" pattern = r"\[img\]([^\[]+)\[/img\]" def replace_image(match): url = match.group(1).strip() # TODO: 可以在这里添加图片代理支持 # 生成带有懒加载的图片标签 return f'' return re.sub(pattern, replace_image, text, flags=re.IGNORECASE) @classmethod def _parse_imagemap(cls, text: str) -> str: """ 解析 [imagemap] 标签 基于 osu-web BBCodeFromDB.php 的实现 """ pattern = r"\[imagemap\]\s*\n([^\s\n]+)\s*\n((?:[0-9.]+ [0-9.]+ [0-9.]+ [0-9.]+ (?:#|https?://[^\s]+|mailto:[^\s]+)[^\n]*\n?)+)\[/imagemap\]" def replace_imagemap(match): image_url = match.group(1).strip() links_data = match.group(2).strip() if not links_data: return f'' # 解析链接数据 links = [] for line in links_data.split("\n"): line = line.strip() if not line: continue # 按空格分割,最多分成6部分(前5个是数字和URL,第6个是标题) parts = line.split(" ", 5) if len(parts) >= 5: try: left = float(parts[0]) top = float(parts[1]) width = float(parts[2]) height = float(parts[3]) href = parts[4] # 标题可能包含空格,所以重新组合 title = parts[5] if len(parts) > 5 else "" # 构建样式 style = f"left: {left}%; top: {top}%; width: {width}%; height: {height}%;" if href == "#": # 无链接区域 links.append(f'') else: # 有链接区域 links.append( f'' ) except (ValueError, IndexError): continue if links: links_html = "".join(links) # 基于官方实现的图片标签 image_html = ( f'' ) # 使用imagemap容器 return f'
{image_html}{links_html}
' else: return f'' return re.sub(pattern, replace_imagemap, text, flags=re.DOTALL | re.IGNORECASE) @classmethod def _parse_italic(cls, text: str) -> str: """解析 [i] 标签""" text = re.sub(r"\[i\]", "", text, flags=re.IGNORECASE) text = re.sub(r"\[/i\]", "", text, flags=re.IGNORECASE) return text @classmethod def _parse_inline_code(cls, text: str) -> str: """解析 [c] 内联代码标签""" text = re.sub(r"\[c\]", "", text, flags=re.IGNORECASE) text = re.sub(r"\[/c\]", "", text, flags=re.IGNORECASE) return text @classmethod def _parse_list(cls, text: str) -> str: """解析 [list] 标签""" # 有序列表 pattern = r"\[list=1\](.*?)\[/list\]" text = re.sub(pattern, r"
    \1
", text, flags=re.DOTALL | re.IGNORECASE) # 无序列表 pattern = r"\[list\](.*?)\[/list\]" text = re.sub(pattern, r"
    \1
", text, flags=re.DOTALL | re.IGNORECASE) # 列表项 pattern = r"\[\*\]\s*(.*?)(?=\[\*\]|\[/list\]|$)" text = re.sub(pattern, r"
  • \1
  • ", text, flags=re.DOTALL | re.IGNORECASE) return text @classmethod def _parse_notice(cls, text: str) -> str: """解析 [notice] 标签""" pattern = r"\[notice\]\n*(.*?)\n*\[/notice\]" return re.sub(pattern, r'
    \1
    ', text, flags=re.DOTALL | re.IGNORECASE) @classmethod def _parse_profile(cls, text: str) -> str: """解析 [profile] 标签""" pattern = r"\[profile(?:=(\d+))?\](.*?)\[/profile\]" def replace_profile(match): user_id = match.group(1) username = match.group(2) if user_id: return f'{username}' else: return f'@{username}' return re.sub(pattern, replace_profile, text, flags=re.IGNORECASE) @classmethod def _parse_quote(cls, text: str) -> str: """解析 [quote] 标签""" # [quote="author"]content[/quote] pattern1 = r'\[quote="([^"]+)"\]\s*(.*?)\s*\[/quote\]' text = re.sub(pattern1, r"

    \1 wrote:

    \2
    ", text, flags=re.DOTALL | re.IGNORECASE) # [quote]content[/quote] pattern2 = r"\[quote\]\s*(.*?)\s*\[/quote\]" text = re.sub(pattern2, r"
    \1
    ", text, flags=re.DOTALL | re.IGNORECASE) return text @classmethod def _parse_size(cls, text: str) -> str: """解析 [size] 标签""" def replace_size(match): size = int(match.group(1)) # 限制字体大小范围 (30-200%) size = max(30, min(200, size)) return f'' pattern = r"\[size=(\d+)\]" text = re.sub(pattern, replace_size, text, flags=re.IGNORECASE) text = re.sub(r"\[/size\]", "", text, flags=re.IGNORECASE) return text @classmethod def _parse_smilies(cls, text: str) -> str: """解析表情符号标签""" # 处理 phpBB 风格的表情符号标记 pattern = r"" return re.sub(pattern, r'", text, flags=re.IGNORECASE) text = re.sub(r"\[/spoiler\]", "", text, flags=re.IGNORECASE) return text @classmethod def _parse_strike(cls, text: str) -> str: """解析 [s] 和 [strike] 标签""" text = re.sub(r"\[s\]", "", text, flags=re.IGNORECASE) text = re.sub(r"\[/s\]", "", text, flags=re.IGNORECASE) text = re.sub(r"\[strike\]", "", text, flags=re.IGNORECASE) text = re.sub(r"\[/strike\]", "", text, flags=re.IGNORECASE) return text @classmethod def _parse_underline(cls, text: str) -> str: """解析 [u] 标签""" text = re.sub(r"\[u\]", "", text, flags=re.IGNORECASE) text = re.sub(r"\[/u\]", "", text, flags=re.IGNORECASE) return text @classmethod def _parse_url(cls, text: str) -> str: """解析 [url] 标签""" # [url]http://example.com[/url] pattern1 = r"\[url\]([^\[]+)\[/url\]" text = re.sub(pattern1, r'\1', text, flags=re.IGNORECASE) # [url=http://example.com]text[/url] pattern2 = r"\[url=([^\]]+)\](.*?)\[/url\]" text = re.sub(pattern2, r'\2', text, flags=re.IGNORECASE) return text @classmethod def _parse_youtube(cls, text: str) -> str: """解析 [youtube] 标签""" pattern = r"\[youtube\]([a-zA-Z0-9_-]{11})\[/youtube\]" def replace_youtube(match): video_id = match.group(1) return ( f"" ) return re.sub(pattern, replace_youtube, text, flags=re.IGNORECASE) @classmethod def sanitize_html(cls, html_content: str) -> str: """ 清理HTML内容,移除危险标签和属性 基于 osu-web 的安全策略 Args: html_content: 要清理的HTML内容 Returns: 清理后的安全HTML """ if not html_content: return "" # 使用bleach清理HTML,配置CSS清理器以允许安全的样式 css_sanitizer = CSSSanitizer( allowed_css_properties=[ "color", "background", "background-color", "font-size", "font-weight", "font-style", "text-decoration", "text-align", "left", "top", "width", "height", "position", "margin", "padding", "max-width", "max-height", "aspect-ratio", "z-index", "display", "border", "border-none", "cursor", ] ) cleaned = bleach.clean( html_content, tags=cls.ALLOWED_TAGS, attributes=cls.ALLOWED_ATTRIBUTES, protocols=["http", "https", "mailto"], css_sanitizer=css_sanitizer, strip=True, ) return cleaned @classmethod def process_userpage_content(cls, raw_content: str, max_length: int = 60000) -> dict[str, str]: """ 处理用户页面内容 基于 osu-web 的处理流程 Args: raw_content: 原始BBCode内容 max_length: 最大允许长度(字符数,支持多字节字符) Returns: 包含raw和html两个版本的字典 """ # 检查内容是否为空或仅包含空白字符 if not raw_content or not raw_content.strip(): raise ContentEmptyError() # 检查长度限制(Python的len()本身支持Unicode字符计数) content_length = len(raw_content) if content_length > max_length: raise ContentTooLongError(content_length, max_length) # 检查是否包含禁止的标签 content_lower = raw_content.lower() for forbidden_tag in cls.FORBIDDEN_TAGS: if f"[{forbidden_tag}" in content_lower or f"<{forbidden_tag}" in content_lower: raise ForbiddenTagError(forbidden_tag) # 转换BBCode为HTML html_content = cls.parse_bbcode(raw_content) # 清理HTML safe_html = cls.sanitize_html(html_content) # 包装在 bbcode 容器中 final_html = f'
    {safe_html}
    ' return {"raw": raw_content, "html": final_html} @classmethod def validate_bbcode(cls, content: str) -> list[str]: """ 验证BBCode语法并返回错误列表 基于 osu-web 的验证逻辑 Args: content: 要验证的BBCode内容 Returns: 错误消息列表 """ errors = [] # 检查内容是否仅包含引用(参考官方逻辑) content_without_quotes = cls._remove_block_quotes(content) if content.strip() and not content_without_quotes.strip(): errors.append("Content cannot contain only quotes") # 检查标签配对 tag_stack = [] tag_pattern = r"\[(/?)(\w+)(?:=[^\]]+)?\]" for match in re.finditer(tag_pattern, content, re.IGNORECASE): is_closing = match.group(1) == "/" tag_name = match.group(2).lower() if is_closing: if not tag_stack: errors.append(f"Closing tag '[/{tag_name}]' without opening tag") elif tag_stack[-1] != tag_name: errors.append(f"Mismatched closing tag '[/{tag_name}]', expected '[/{tag_stack[-1]}]'") else: tag_stack.pop() else: # 特殊处理自闭合标签(只有列表项 * 是真正的自闭合) if tag_name not in ["*"]: tag_stack.append(tag_name) # 检查未关闭的标签 for unclosed_tag in tag_stack: errors.append(f"Unclosed tag '[{unclosed_tag}]'") return errors @classmethod def _remove_block_quotes(cls, text: str) -> str: """ 移除引用块(参考 osu-web BBCodeFromDB::removeBlockQuotes) Args: text: 原始文本 Returns: 移除引用后的文本 """ # 基于官方实现的简化版本 # 移除 [quote]...[/quote] 和 [quote=author]...[/quote] pattern = r"\[quote(?:=[^\]]+)?\].*?\[/quote\]" result = re.sub(pattern, "", text, flags=re.DOTALL | re.IGNORECASE) return result.strip() @classmethod def remove_bbcode_tags(cls, text: str) -> str: """ 移除所有BBCode标签,只保留纯文本 用于搜索索引等场景 基于官方实现 """ # 基于官方实现的完整BBCode标签模式 pattern = ( r"\[/?(\*|\*:m|audio|b|box|color|spoilerbox|centre|center|code|email|heading|i|img|" r"list|list:o|list:u|notice|profile|quote|s|strike|u|spoiler|size|url|youtube|c)" r"(=.*?(?=:))?(:[a-zA-Z0-9]{1,5})?\]" ) return re.sub(pattern, "", text) # 服务实例 bbcode_service = BBCodeService()