Fix telegram.error.BadRequest: Can't parse entities: unsupported start tag 'li' at byte offset 1191

2025-07-12 21:20:18 +08:00
parent b48ec0a48d
commit 2a3b6c0151
1 changed file with 23 additions and 10 deletions
--- a/main.py
+++ b/main.py
@@ -31,27 +31,40 @@ async def handle_message(update: Update, context: ContextTypes.DEFAULT_TYPE) ->
    
    # Simple markdown to HTML conversion
    html = markdown.markdown(response_text)
+    
+    # Debug: print the HTML before cleaning (remove this later)
+    print(f"DEBUG - Original HTML: {html}")
+    
    # Basic cleanup for Telegram HTML compatibility
    html = html.replace('<strong>', '<b>').replace('</strong>', '</b>')
    html = html.replace('<em>', '<i>').replace('</em>', '</i>')
-    html = re.sub(r'<p>(.*?)</p>', r'\1\n\n', html, flags=re.DOTALL)
    
-    # Handle lists - convert to simple bullet points
-    html = re.sub(r'<ul>\s*', '', html)
-    html = re.sub(r'\s*</ul>', '', html)
-    html = re.sub(r'<li>(.*?)</li>', r'• \1\n', html, flags=re.DOTALL)
+    # Remove ALL list-related tags more aggressively
+    html = re.sub(r'</?ul[^>]*>', '', html, flags=re.IGNORECASE)
+    html = re.sub(r'</?ol[^>]*>', '', html, flags=re.IGNORECASE)
+    html = re.sub(r'<li[^>]*>(.*?)</li>', r'• \1\n', html, flags=re.DOTALL | re.IGNORECASE)
+    html = re.sub(r'</?li[^>]*>', '', html, flags=re.IGNORECASE)  # Remove any remaining li tags
    
-    # Clean up any other unsupported tags that might be present
-    html = re.sub(r'<ol>\s*', '', html)
-    html = re.sub(r'\s*</ol>', '', html)
-    html = re.sub(r'<h[1-6]>(.*?)</h[1-6]>', r'<b>\1</b>\n', html, flags=re.DOTALL)
+    # Handle paragraphs
+    html = re.sub(r'<p[^>]*>(.*?)</p>', r'\1\n\n', html, flags=re.DOTALL | re.IGNORECASE)
+    html = re.sub(r'</?p[^>]*>', '', html, flags=re.IGNORECASE)  # Remove any remaining p tags
+    
+    # Handle headers
+    html = re.sub(r'<h[1-6][^>]*>(.*?)</h[1-6]>', r'<b>\1</b>\n', html, flags=re.DOTALL | re.IGNORECASE)
+    
+    # Remove any other potentially problematic tags
+    html = re.sub(r'</?div[^>]*>', '', html, flags=re.IGNORECASE)
+    html = re.sub(r'</?span[^>]*>', '', html, flags=re.IGNORECASE)
    
    # Clean up extra whitespace
    html = re.sub(r'\n\s*\n', '\n\n', html)
    html = html.strip()
    
+    # Debug: print the cleaned HTML (remove this later)
+    print(f"DEBUG - Cleaned HTML: {html}")
+    
    # Send the formatted message using HTML parse mode
-    await update.message.reply_text(html.strip(), parse_mode='HTML')
+    await update.message.reply_text(html, parse_mode='HTML')

 if __name__ == "__main__":